[ExecuTorch][WebGPU] Add mul op with full broadcast (aten.mul.Tensor)

JulianCloudNTH · JulianCloudNTH · commit f6945a4b8dc7 · 2026-06-18T09:07:14.000-07:00
Pull Request resolved: #20358

Adds `aten.mul.Tensor` to the WebGPU delegate with full PyTorch broadcast, plus the shared `runtime/ops/TensorMeta.h` per-tensor uniform that broadcast ops reuse. Mul is on the Llama critical path — `F.silu` decomposes to `sigmoid` + `mul`, and SwiGLU multiplies two same-shape activations (the fast path).

Composition (single dispatch):
- `TensorMeta.h` (NEW) — 48-byte std140 `{ndim, numel, sizes[4], strides[4]}` UBO mirroring Vulkan's per-tensor `BufferMetadata`; `fill_tensor_meta_broadcast` right-aligns operand dims (rank>4 throws); `static_assert(sizeof==48)`.
- `mul/BinaryOp.cpp` — builds 3 `TensorMeta` UBOs (out/in1/in2 at bindings 3/4/5), guards fp32 + rank≤4, 1D-dispatches over `compute_1d_workgroup_count(numel)`, releases all uniforms after the bind group.
- `mul/binary_mul.wgsl` — same-shape fast path + a broadcast path (delinearize output index, clamp each input coord per-dim to size-1, relinearize on input strides).
- `WebGPUUtils.h` — adds the shared `utils::make_uniform` helper (first use).

ghstack-source-id: 394848336
@exported-using-ghexport

Differential Revision: [D108793167](https://our.internmc.facebook.com/intern/diff/D108793167/)
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
@@ -38,6 +38,7 @@ set(WEBGPU_SRCS
     runtime/ops/sdpa/Sdpa.cpp
     runtime/ops/select_as_symint/SelectAsSymint.cpp
     runtime/ops/quantized_linear/QuantizedLinear.cpp
+    runtime/ops/mul/BinaryOp.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/WebGPUUtils.h b/backends/webgpu/runtime/WebGPUUtils.h
@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <cstring>
 #include <stdexcept>
 #include <string>
 
@@ -48,4 +49,25 @@ inline uint32_t compute_1d_workgroup_count(
   return count;
 }
 
+// Create a uniform buffer mapped-at-creation, copy `size` bytes in, and unmap.
+//
+// WebGPU requires the size of a mapped-at-creation buffer, and the size passed
+// to getMappedRange, to be a multiple of 4. The allocation is therefore
+// rounded up to the next multiple of 4 and the padding bytes are zeroed, so a
+// payload of any size can be uploaded; the first `size` bytes of the buffer
+// always hold the payload.
+//
+// device: device the buffer is created on.
+// data:   source bytes; must be non-null when `size` is non-zero.
+// size:   number of bytes copied from `data`.
+// Returns the unmapped buffer; the caller owns the reference and releases it
+// with wgpuBufferRelease().
+// Throws std::runtime_error if `data` is null with a non-zero `size`, if
+// buffer creation fails, or if the mapped range is null (the buffer is
+// released before throwing in the latter case).
+inline WGPUBuffer
+make_uniform(WGPUDevice device, const void* data, size_t size) {
+  if (data == nullptr && size != 0) {
+    throw std::runtime_error("make_uniform: data is null");
+  }
+
+  // Round up to the 4-byte granularity required for mapped-at-creation.
+  constexpr size_t kMapAlignment = 4;
+  const size_t padded_size =
+      (size + (kMapAlignment - 1)) / kMapAlignment * kMapAlignment;
+
+  WGPUBufferDescriptor desc = {};
+  desc.size = padded_size;
+  desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+  desc.mappedAtCreation = true;
+  WGPUBuffer buf = wgpuDeviceCreateBuffer(device, &desc);
+  if (!buf) {
+    throw std::runtime_error("make_uniform: buffer creation failed");
+  }
+  void* ptr = wgpuBufferGetMappedRange(buf, 0, padded_size);
+  if (!ptr) {
+    wgpuBufferRelease(buf);
+    throw std::runtime_error("make_uniform: mapped range is null");
+  }
+  if (size != 0) {
+    std::memcpy(ptr, data, size);
+  }
+  if (padded_size != size) {
+    // Keep the tail deterministic instead of uploading whatever was mapped.
+    std::memset(static_cast<unsigned char*>(ptr) + size, 0, padded_size - size);
+  }
+  wgpuBufferUnmap(buf);
+  return buf;
+}
+
 } // namespace executorch::backends::webgpu::utils
diff --git a/backends/webgpu/runtime/ops/TensorMeta.h b/backends/webgpu/runtime/ops/TensorMeta.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+
+namespace executorch::backends::webgpu {
+
+// Maximum tensor rank a TensorMeta can describe; also the length of its
+// `sizes` and `strides` arrays.
+constexpr uint32_t kTensorMetaMaxNdim = 4;
+
+// Per-tensor metadata UBO; mirrors Vulkan BufferMetadata (4-dim NCHW, std140).
+// The struct is uploaded as raw bytes, so its field order, padding and total
+// size are part of the contract with the shader (see the static_asserts
+// below).
+struct TensorMeta {
+  // Number of leading slots of `sizes` / `strides` that are meaningful.
+  uint32_t ndim;
+  // Total element count (product of the tensor's dims).
+  uint32_t numel;
+  // Explicit padding so that `sizes` starts at byte offset 16.
+  uint32_t _pad[2];
+  // Per-dimension extent; the fill helpers in this header set unused slots
+  // to 1.
+  uint32_t sizes[kTensorMetaMaxNdim];
+  // Per-dimension element stride; the fill helpers in this header set unused
+  // slots to 0.
+  uint32_t strides[kTensorMetaMaxNdim];
+};
+
+static_assert(
+    sizeof(TensorMeta) == 48,
+    "TensorMeta std140 layout must be 48 bytes to match the WGSL uniform");
+// Lock the std140 field offsets the WGSL uniform reads, not just total size.
+static_assert(offsetof(TensorMeta, ndim) == 0);
+static_assert(offsetof(TensorMeta, numel) == 4);
+static_assert(offsetof(TensorMeta, sizes) == 16);
+static_assert(offsetof(TensorMeta, strides) == 32);
+
+// Fill TensorMeta from NCHW dims: contiguous strides, padded trailing slots.
+//
+// `m->strides[i]` is the product of the sizes to the right of dim `i` (the
+// last dim has stride 1) and `m->numel` is the product of all sizes. Slots at
+// index >= rank are left at size 1 / stride 0.
+//
+// Throws std::runtime_error when:
+//   - the rank of `t` exceeds kTensorMetaMaxNdim,
+//   - a dim is negative or does not fit in 32 bits, or
+//   - a stride or the element count does not fit in the 32-bit fields of
+//     TensorMeta (these products used to wrap around silently).
+// `*m` may be partially written when an exception is thrown.
+inline void fill_tensor_meta(const WebGPUTensor& t, TensorMeta* m) {
+  const uint32_t ndim = static_cast<uint32_t>(t.dims.size());
+  if (ndim > kTensorMetaMaxNdim) {
+    throw std::runtime_error("TensorMeta: tensor rank exceeds 4 (MAX_NDIM)");
+  }
+  *m = {};
+  for (uint32_t d = 0; d < kTensorMetaMaxNdim; d++) {
+    m->sizes[d] = 1u;
+    m->strides[d] = 0u;
+  }
+  m->ndim = ndim;
+  // Accumulate in 64 bits: both factors are <= UINT32_MAX, so the product
+  // cannot wrap and a 32-bit overflow is detectable.
+  uint64_t acc = 1u;
+  for (uint32_t i = ndim; i-- > 0;) {
+    const int64_t dim = static_cast<int64_t>(t.dims[i]);
+    if (dim < 0 || dim > static_cast<int64_t>(UINT32_MAX)) {
+      throw std::runtime_error("TensorMeta: dim does not fit in uint32");
+    }
+    m->sizes[i] = static_cast<uint32_t>(dim);
+    m->strides[i] = static_cast<uint32_t>(acc);
+    acc *= static_cast<uint64_t>(dim);
+    if (acc > UINT32_MAX) {
+      throw std::runtime_error("TensorMeta: numel exceeds uint32 range");
+    }
+  }
+  m->numel = static_cast<uint32_t>(acc);
+}
+
+// Broadcast variant: right-align operand dims into out rank (PyTorch trailing).
+//
+// The operand's dim `i` is written to slot `out_ndim - rank + i`, so an
+// operand of lower rank than the output gets leading slots of size 1 and
+// stride 0. Strides are the contiguous strides of the operand itself and
+// `m->numel` is the operand's own element count; `m->ndim` is set to
+// `out_ndim`.
+//
+// Throws std::runtime_error when:
+//   - `out_ndim` exceeds kTensorMetaMaxNdim,
+//   - the rank of `t` exceeds `out_ndim`,
+//   - a dim is negative or does not fit in 32 bits, or
+//   - a stride or the element count does not fit in the 32-bit fields of
+//     TensorMeta (these products used to wrap around silently).
+// `*m` may be partially written when an exception is thrown.
+inline void fill_tensor_meta_broadcast(
+    const WebGPUTensor& t,
+    uint32_t out_ndim,
+    TensorMeta* m) {
+  const uint32_t rank = static_cast<uint32_t>(t.dims.size());
+  if (out_ndim > kTensorMetaMaxNdim) {
+    throw std::runtime_error("TensorMeta: out_ndim exceeds 4 (MAX_NDIM)");
+  }
+  if (rank > out_ndim) {
+    throw std::runtime_error("TensorMeta: operand rank exceeds out_ndim");
+  }
+  *m = {};
+  for (uint32_t d = 0; d < kTensorMetaMaxNdim; d++) {
+    m->sizes[d] = 1u;
+    m->strides[d] = 0u;
+  }
+  m->ndim = out_ndim;
+  // Number of leading output slots the operand does not cover.
+  const uint32_t lead = out_ndim - rank;
+  // Accumulate in 64 bits: both factors are <= UINT32_MAX, so the product
+  // cannot wrap and a 32-bit overflow is detectable.
+  uint64_t acc = 1u;
+  for (uint32_t i = rank; i-- > 0;) {
+    const int64_t dim = static_cast<int64_t>(t.dims[i]);
+    if (dim < 0 || dim > static_cast<int64_t>(UINT32_MAX)) {
+      throw std::runtime_error("TensorMeta: dim does not fit in uint32");
+    }
+    const uint32_t slot = lead + i;
+    m->sizes[slot] = static_cast<uint32_t>(dim);
+    m->strides[slot] = static_cast<uint32_t>(acc);
+    acc *= static_cast<uint64_t>(dim);
+    if (acc > UINT32_MAX) {
+      throw std::runtime_error("TensorMeta: numel exceeds uint32 range");
+    }
+  }
+  m->numel = static_cast<uint32_t>(acc);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/mul/BinaryOp.cpp b/backends/webgpu/runtime/ops/mul/BinaryOp.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/TensorMeta.h>
+#include <executorch/backends/webgpu/runtime/ops/mul/binary_mul_wgsl.h>
+
+#include <webgpu/webgpu.h>
+
+#include <stdexcept>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// Record one compute dispatch that writes out = in1 * in2 with broadcasting.
+//
+// graph: graph that owns the tensors and receives the dispatch.
+// args:  value ids [in1, in2, out]; fewer than three entries makes
+//        std::vector::at throw std::out_of_range.
+//
+// Throws std::runtime_error when any tensor has rank > kTensorMetaMaxNdim,
+// when an operand dim (right-aligned into the output rank) is neither 1 nor
+// equal to the output dim, or when any tensor's byte size is not
+// numel * sizeof(float). Errors from the TensorMeta / utils helpers
+// propagate unchanged.
+void mul_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // aten.mul.Tensor args: [in1, in2, out] (self, other; no alpha)
+  const int in1_id = args.at(0);
+  const int in2_id = args.at(1);
+  const int out_id = args.at(2);
+
+  WGPUDevice device = graph.device();
+
+  const auto& in1_tensor = graph.get_tensor(in1_id);
+  const auto& in2_tensor = graph.get_tensor(in2_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+
+  // Rank guard (NCHW backend is <= 4 dims; 1D dispatch only).
+  if (out_tensor.dims.size() > kTensorMetaMaxNdim ||
+      in1_tensor.dims.size() > kTensorMetaMaxNdim ||
+      in2_tensor.dims.size() > kTensorMetaMaxNdim) {
+    throw std::runtime_error("mul: tensor rank exceeds 4 (MAX_NDIM)");
+  }
+
+  const uint32_t out_ndim = static_cast<uint32_t>(out_tensor.dims.size());
+
+  // 3 per-tensor meta uniforms (mirror Vulkan); inputs broadcast-aligned.
+  TensorMeta out_meta;
+  TensorMeta in1_meta;
+  TensorMeta in2_meta;
+  fill_tensor_meta_broadcast(out_tensor, out_ndim, &out_meta);
+  fill_tensor_meta_broadcast(in1_tensor, out_ndim, &in1_meta);
+  fill_tensor_meta_broadcast(in2_tensor, out_ndim, &in2_meta);
+
+  // Broadcast guard: the shader clamps each input coordinate to size - 1, so
+  // an operand dim that is neither 1 nor the output dim would not fail on the
+  // GPU; it would silently read the wrong elements. Reject it here, before
+  // any GPU object is created.
+  for (uint32_t d = 0; d < out_ndim; d++) {
+    const uint32_t out_sz = out_meta.sizes[d];
+    const bool in1_ok = in1_meta.sizes[d] == out_sz || in1_meta.sizes[d] == 1u;
+    const bool in2_ok = in2_meta.sizes[d] == out_sz || in2_meta.sizes[d] == 1u;
+    if (!in1_ok || !in2_ok) {
+      throw std::runtime_error(
+          "mul: operand shape is not broadcastable to the output shape");
+    }
+  }
+
+  // fp32-only: nbytes must equal numel * 4 for every operand.
+  if (out_tensor.nbytes !=
+          static_cast<size_t>(out_meta.numel) * sizeof(float) ||
+      in1_tensor.nbytes !=
+          static_cast<size_t>(in1_meta.numel) * sizeof(float) ||
+      in2_tensor.nbytes !=
+          static_cast<size_t>(in2_meta.numel) * sizeof(float)) {
+    throw std::runtime_error("mul: non-fp32 operand (nbytes != numel * 4)");
+  }
+
+  uint32_t wg_size =
+      utils::clamp_workgroup_size(device, kBinaryMulWorkgroupSizeX);
+  uint32_t workgroup_count =
+      utils::compute_1d_workgroup_count(device, out_meta.numel, wg_size, "mul");
+
+  // Value for the shader's `wg_size` pipeline-overridable constant.
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
+  // Upload the three metas. make_uniform can throw; release whatever was
+  // already created so a failure part-way through does not leak buffers.
+  constexpr uint32_t kNumMetas = 3;
+  const TensorMeta* const metas[kNumMetas] = {&out_meta, &in1_meta, &in2_meta};
+  WGPUBuffer meta_bufs[kNumMetas] = {};
+  try {
+    for (uint32_t i = 0; i < kNumMetas; i++) {
+      meta_bufs[i] = utils::make_uniform(device, metas[i], sizeof(TensorMeta));
+    }
+  } catch (...) {
+    for (uint32_t i = 0; i < kNumMetas; i++) {
+      if (meta_bufs[i]) {
+        wgpuBufferRelease(meta_bufs[i]);
+      }
+    }
+    throw;
+  }
+  WGPUBuffer out_meta_buf = meta_bufs[0];
+  WGPUBuffer in1_meta_buf = meta_bufs[1];
+  WGPUBuffer in2_meta_buf = meta_bufs[2];
+  graph.add_uniform_buffer_bytes(3 * sizeof(TensorMeta));
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {kBinaryMulWGSL, WGPU_STRLEN};
+
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc);
+
+  // Bind group: in1, in2, out (rw), out_meta, in1_meta, in2_meta (3 uniforms).
+  constexpr uint32_t kNumBindings = 6;
+  const WGPUBufferBindingType binding_types[kNumBindings] = {
+      WGPUBufferBindingType_ReadOnlyStorage,
+      WGPUBufferBindingType_ReadOnlyStorage,
+      WGPUBufferBindingType_Storage,
+      WGPUBufferBindingType_Uniform,
+      WGPUBufferBindingType_Uniform,
+      WGPUBufferBindingType_Uniform,
+  };
+
+  WGPUBindGroupLayoutEntry entries[kNumBindings] = {};
+  for (uint32_t b = 0; b < kNumBindings; b++) {
+    entries[b].binding = b;
+    entries[b].visibility = WGPUShaderStage_Compute;
+    entries[b].buffer.type = binding_types[b];
+  }
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = kNumBindings;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
+
+  // Buffers and byte sizes in binding order (must match `binding_types`).
+  const WGPUBuffer bound_buffers[kNumBindings] = {
+      in1_tensor.buffer,
+      in2_tensor.buffer,
+      out_tensor.buffer,
+      out_meta_buf,
+      in1_meta_buf,
+      in2_meta_buf,
+  };
+  const uint64_t bound_sizes[kNumBindings] = {
+      static_cast<uint64_t>(in1_tensor.nbytes),
+      static_cast<uint64_t>(in2_tensor.nbytes),
+      static_cast<uint64_t>(out_tensor.nbytes),
+      sizeof(TensorMeta),
+      sizeof(TensorMeta),
+      sizeof(TensorMeta),
+  };
+
+  WGPUBindGroupEntry bg_entries[kNumBindings] = {};
+  for (uint32_t b = 0; b < kNumBindings; b++) {
+    bg_entries[b].binding = b;
+    bg_entries[b].buffer = bound_buffers[b];
+    bg_entries[b].size = bound_sizes[b];
+  }
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bgl;
+  bg_desc.entryCount = kNumBindings;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
+
+  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  wgpuShaderModuleRelease(shader);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  // Drop our refs; the bind group keeps the uniforms alive until release.
+  wgpuBufferRelease(out_meta_buf);
+  wgpuBufferRelease(in1_meta_buf);
+  wgpuBufferRelease(in2_meta_buf);
+}
+
+} // namespace
+
+// Register mul_impl as the handler for the `aten.mul.Tensor` operator name.
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.mul.Tensor, mul_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/mul/binary_mul.wgsl b/backends/webgpu/runtime/ops/mul/binary_mul.wgsl
@@ -0,0 +1,48 @@
+// Operand and result buffers, all f32. The inputs are read-only; the output
+// is read_write because the shader stores into it.
+@group(0) @binding(0) var<storage, read> input1: array<f32>;
+@group(0) @binding(1) var<storage, read> input2: array<f32>;
+@group(0) @binding(2) var<storage, read_write> output: array<f32>;
+
+// Per-tensor metadata uniform. `sizes` and `strides` are vec4s indexed by
+// dimension; only the first `ndim` components are read by `main`.
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+// One metadata block per tensor: output first, then the two inputs.
+@group(0) @binding(3) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(4) var<uniform> in1_meta: TensorMeta;
+@group(0) @binding(5) var<uniform> in2_meta: TensorMeta;
+
+// Workgroup width along x; pipeline-overridable, defaults to 64.
+override wg_size: u32 = 64u;
+
+// One invocation per output element: output[i] = input1[a] * input2[b], where
+// a and b are the (possibly broadcast) source offsets for output index i.
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_index = gid.x;
+    // Invocations past the last output element have nothing to do.
+    if (out_index >= out_meta.numel) {
+        return;
+    }
+
+    // Elementwise fast path: taken only when both inputs have exactly the
+    // output's size in every dimension.
+    var shapes_match = true;
+    for (var axis = 0u; axis < out_meta.ndim; axis++) {
+        let extent = out_meta.sizes[axis];
+        let in1_matches = in1_meta.sizes[axis] == extent;
+        let in2_matches = in2_meta.sizes[axis] == extent;
+        shapes_match = shapes_match && in1_matches && in2_matches;
+    }
+    if (shapes_match) {
+        output[out_index] = input1[out_index] * input2[out_index];
+        return;
+    }
+
+    // Broadcast path: peel the output index into per-dimension coordinates
+    // using the output strides, clamp each coordinate to the input's last
+    // valid index (0 for size-1 dims), and rebuild a linear offset from the
+    // input's own strides.
+    var remaining = out_index;
+    var in1_offset = 0u;
+    var in2_offset = 0u;
+    for (var axis = 0u; axis < out_meta.ndim; axis++) {
+        let out_stride = out_meta.strides[axis];
+        let coord = remaining / out_stride;
+        remaining = remaining % out_stride;
+        let in1_coord = min(coord, in1_meta.sizes[axis] - 1u);
+        let in2_coord = min(coord, in2_meta.sizes[axis] - 1u);
+        in1_offset += in1_coord * in1_meta.strides[axis];
+        in2_offset += in2_coord * in2_meta.strides[axis];
+    }
+    output[out_index] = input1[in1_offset] * input2[in2_offset];
+}
diff --git a/backends/webgpu/runtime/ops/mul/binary_mul_wgsl.h b/backends/webgpu/runtime/ops/mul/binary_mul_wgsl.h

Original file line number	Diff line number	Diff line change
`@@ -38,6 +38,7 @@ set(WEBGPU_SRCS`
`38`	`38`	`runtime/ops/sdpa/Sdpa.cpp`
`39`	`39`	`runtime/ops/select_as_symint/SelectAsSymint.cpp`
`40`	`40`	`runtime/ops/quantized_linear/QuantizedLinear.cpp`
	`41`	`+ runtime/ops/mul/BinaryOp.cpp`
`41`	`42`	`)`
`42`	`43`
`43`	`44`	`add_library(webgpu_backend ${WEBGPU_SRCS})`