Generate *_wgsl.h embedded shaders from *.wgsl (#19981)

JulianCloudNTH · facebook-github-bot · commit c36f9fa522fa · 2026-06-05T09:41:06.000-07:00
Summary:

Adds `backends/webgpu/scripts/gen_wgsl_headers.py` to generate each `runtime/ops/<op>/<shader>_wgsl.h` from its `<shader>.wgsl`, so each WGSL shader has a single canonical source instead of a hand-maintained embedded copy that can silently drift. Each header embeds the shader verbatim (`inline constexpr const char* k<Op>WGSL = R"(...)";` plus the workgroup-size constants) and a `// wgsl-sha256:` of the source; `--check` (wired into `test_build_webgpu.sh` and the `webgpu_backend` CMake build) and the unit tests fail the build if any committed header drifts.

`workgroup_size` is parsed for all three dims (WGSL allows 1-3; y and z default to 1), emitting `k<Op>WorkgroupSizeX/Y/Z` so future 2D/3D shaders need no codegen change; the two current 1D consumers read `...X`. The X/Y/Z naming and `uint32_t`-per-axis mirror Vulkan's `utils::WorkgroupSize` (`backends/vulkan/runtime/utils/VecUtils.h`); WGSL `workgroup_size` is compile-time, so the value is parsed from the shader rather than set via runtime spec-constants as in Vulkan. The drift check compares the full rendered header (not just the shader sha), so a generator-logic change is also detected/regenerated. The parser accepts the spaced form `workgroup_size (n)` and suffix-typed literals (`64u`).

Regenerates the two existing committed op headers: `binary_add_wgsl.h` and `rms_norm_wgsl.h` gain the `...X/Y/Z` constants (X = the 1D size, Y=Z=1); `rms_norm.wgsl` also drops its now-obsolete 3-line "keep in sync by hand" note (codegen + `--check` make it false). The shader code itself is unchanged.

This change was authored with assistance from Claude.

Reviewed By: SS-JIA

Differential Revision: D107403275
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
@@ -37,6 +37,17 @@ set(WEBGPU_SRCS
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
 
+# Verify committed *_wgsl.h match their *.wgsl (drift fails the build).
+resolve_python_executable()
+add_custom_target(
+  webgpu_wgsl_headers_check ALL
+  COMMAND "${PYTHON_EXECUTABLE}"
+          "${CMAKE_CURRENT_SOURCE_DIR}/scripts/gen_wgsl_headers.py" --check
+  WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+  COMMENT "Checking WebGPU embedded-WGSL headers are in sync"
+)
+add_dependencies(webgpu_backend webgpu_wgsl_headers_check)
+
 target_include_directories(
   webgpu_backend PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
 )
diff --git a/backends/webgpu/runtime/ops/add/BinaryOp.cpp b/backends/webgpu/runtime/ops/add/BinaryOp.cpp
@@ -52,7 +52,7 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
       static_cast<uint32_t>(out_tensor.nbytes / sizeof(float));
 
   uint32_t wg_size =
-      utils::clamp_workgroup_size(device, kBinaryAddWorkgroupSize);
+      utils::clamp_workgroup_size(device, kBinaryAddWorkgroupSizeX);
   uint32_t workgroup_count =
       utils::compute_1d_workgroup_count(device, num_elements, wg_size, "add");
 
diff --git a/backends/webgpu/runtime/ops/add/binary_add_wgsl.h b/backends/webgpu/runtime/ops/add/binary_add_wgsl.h
@@ -8,11 +8,12 @@
 
 #pragma once
 
-namespace executorch {
-namespace backends {
-namespace webgpu {
+#include <cstdint>
 
-// WGSL shader source for element-wise add: output = input1 + alpha * input2
+namespace executorch::backends::webgpu {
+
+// @generated from binary_add.wgsl - DO NOT EDIT.
+// wgsl-sha256: c1ceec80c8d4d3d56986ad91ce0d7f9a57cd8467b8c3aa07a28da70e51d141d9
 inline constexpr const char* kBinaryAddWGSL = R"(
 @group(0) @binding(0) var<storage, read> input1: array<f32>;
 @group(0) @binding(1) var<storage, read> input2: array<f32>;
@@ -36,8 +37,8 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
 }
 )";
 
-inline constexpr uint32_t kBinaryAddWorkgroupSize = 256;
+inline constexpr uint32_t kBinaryAddWorkgroupSizeX = 256;
+inline constexpr uint32_t kBinaryAddWorkgroupSizeY = 1;
+inline constexpr uint32_t kBinaryAddWorkgroupSizeZ = 1;
 
-} // namespace webgpu
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/rms_norm/RmsNorm.cpp b/backends/webgpu/runtime/ops/rms_norm/RmsNorm.cpp
@@ -172,9 +172,9 @@ void rms_norm_impl(WebGPUGraph& graph, const std::vector<int>& args) {
   bg_desc.entries = bg_entries;
   WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
 
-  // One workgroup per row (kRmsNormWorkgroupSize threads cooperate per row)
+  // One workgroup per row (kRmsNormWorkgroupSizeX threads cooperate per row)
   static_assert(
-      kRmsNormWorkgroupSize == 64,
+      kRmsNormWorkgroupSizeX == 64,
       "must match @workgroup_size and WG_SIZE in rms_norm.wgsl");
   graph.add_dispatch({pipeline, bind_group, num_rows});
 
diff --git a/backends/webgpu/runtime/ops/rms_norm/rms_norm.wgsl b/backends/webgpu/runtime/ops/rms_norm/rms_norm.wgsl
@@ -1,6 +1,3 @@
-// NOTE: This file is for editor/tooling support only. The runtime consumes the
-// inline copy of this shader in `rms_norm_wgsl.h` (kRmsNormWGSL). Keep the two
-// in sync by hand — any edit here must be mirrored there.
 @group(0) @binding(0) var<storage, read_write> t_out: array<f32>;
 @group(0) @binding(1) var<storage, read> t_in: array<f32>;
 @group(0) @binding(2) var<storage, read> t_weight: array<f32>;
diff --git a/backends/webgpu/runtime/ops/rms_norm/rms_norm_wgsl.h b/backends/webgpu/runtime/ops/rms_norm/rms_norm_wgsl.h
@@ -12,12 +12,8 @@
 
 namespace executorch::backends::webgpu {
 
-// WGSL shader source for rms_norm: y = x * w * rsqrt(mean(x^2) + eps)
-//
-// NOTE: This inline string is the runtime source of truth — it is what gets
-// passed to wgpuDeviceCreateShaderModule. The sibling `rms_norm.wgsl` file
-// exists only for editor/tooling support and must be kept identical to this
-// string by hand; there is no build-time sync.
+// @generated from rms_norm.wgsl - DO NOT EDIT.
+// wgsl-sha256: 340dcbf3c06dc311e70bef953c1e9cbbdf4121fe177eedd3253549e614b55069
 inline constexpr const char* kRmsNormWGSL = R"(
 @group(0) @binding(0) var<storage, read_write> t_out: array<f32>;
 @group(0) @binding(1) var<storage, read> t_in: array<f32>;
@@ -93,6 +89,8 @@ fn main(
 }
 )";
 
-inline constexpr uint32_t kRmsNormWorkgroupSize = 64;
+inline constexpr uint32_t kRmsNormWorkgroupSizeX = 64;
+inline constexpr uint32_t kRmsNormWorkgroupSizeY = 1;
+inline constexpr uint32_t kRmsNormWorkgroupSizeZ = 1;
 
 } // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/scripts/gen_wgsl_headers.py b/backends/webgpu/scripts/gen_wgsl_headers.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Generate runtime/ops/<op>/<stem>_wgsl.h from each <stem>.wgsl.
+
+Each header embeds the shader verbatim as `inline constexpr const char*
+k<Pascal>WGSL` plus `k<Pascal>WorkgroupSizeX/Y/Z` (parsed from @workgroup_size).
+
+Usage:
+  gen_wgsl_headers.py            # (re)write all <stem>_wgsl.h
+  gen_wgsl_headers.py --check    # exit 1 if any committed header is stale
+
+Stdlib only (the devserver has no third-party pip).
+"""
+
+import argparse
+import hashlib
+import re
+import sys
+from pathlib import Path
+
+BACKEND_ROOT = Path(__file__).resolve().parents[1]
+
+_SHA_RE = re.compile(r"// wgsl-sha256: ([0-9a-f]{64})")
+
+_BSD_HEADER = """\
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */"""
+
+
+def symbol_base(stem: str) -> str:
+    """snake_case shader stem -> PascalCase symbol base (binary_add -> BinaryAdd)."""
+    return "".join(part.capitalize() for part in stem.split("_"))
+
+
+_INT_LITERAL_RE = re.compile(r"^(\d+)[uUiI]?$")
+
+
+def _resolve_dim(tok: str, src: str) -> int:
+    """Resolve one @workgroup_size dim token: a literal or an override/const ident.
+
+    Accepts WGSL suffix-typed integer literals (e.g. `64u`, `64i`) both as the
+    token and as an `override`/`const` initializer (`: u32`/`: i32` optional).
+    """
+    lit = _INT_LITERAL_RE.match(tok)
+    if lit:
+        return int(lit.group(1))
+    m = re.search(
+        r"(?:override|const)\s+"
+        + re.escape(tok)
+        + r"\s*(?::\s*[iu]32\s*)?=\s*(\d+)[uUiI]?\s*;",
+        src,
+    )
+    if not m:
+        raise ValueError(f"cannot resolve @workgroup_size identifier '{tok}'")
+    return int(m.group(1))
+
+
+def parse_workgroup_size(src: str) -> tuple[int, int, int]:
+    """Resolve the (x, y, z) dims of @workgroup_size; y and z default to 1."""
+    m = re.search(r"@workgroup_size\s*\(([^)]*)\)", src)
+    if not m:
+        raise ValueError("no @workgroup_size found")
+    toks = [t.strip() for t in m.group(1).split(",") if t.strip()]
+    if not toks or len(toks) > 3:
+        raise ValueError(f"@workgroup_size takes 1-3 dims, got {len(toks)}")
+    dims = [_resolve_dim(t, src) for t in toks]
+    while len(dims) < 3:
+        dims.append(1)
+    return (dims[0], dims[1], dims[2])
+
+
+def wgsl_sha256(wgsl_text: str) -> str:
+    return hashlib.sha256(wgsl_text.encode("utf-8")).hexdigest()
+
+
+def embedded_sha256(header_text: str) -> str:
+    m = _SHA_RE.search(header_text)
+    return m.group(1) if m else ""
+
+
+def render_header(wgsl_path, wgsl_text: str) -> str:
+    """Render the full <stem>_wgsl.h text for a shader (shader embedded verbatim)."""
+    if ')"' in wgsl_text:
+        raise ValueError('shader contains )" which would close the R"( literal')
+    stem = Path(wgsl_path).stem
+    base = symbol_base(stem)
+    x, y, z = parse_workgroup_size(wgsl_text)
+
+    head = [
+        _BSD_HEADER,
+        "",
+        "#pragma once",
+        "",
+        "#include <cstdint>",
+        "",
+        "namespace executorch::backends::webgpu {",
+        "",
+        f"// @generated from {stem}.wgsl - DO NOT EDIT.",
+        f"// wgsl-sha256: {wgsl_sha256(wgsl_text)}",
+        f'inline constexpr const char* k{base}WGSL = R"(',
+    ]
+    return (
+        "\n".join(head)
+        + "\n"
+        + wgsl_text
+        + ')";'
+        + "\n\n"
+        + f"inline constexpr uint32_t k{base}WorkgroupSizeX = {x};\n"
+        + f"inline constexpr uint32_t k{base}WorkgroupSizeY = {y};\n"
+        + f"inline constexpr uint32_t k{base}WorkgroupSizeZ = {z};\n\n"
+        + "} // namespace executorch::backends::webgpu\n"
+    )
+
+
+def discover():
+    """All shader sources under runtime/ops, sorted."""
+    return sorted((BACKEND_ROOT / "runtime/ops").glob("**/*.wgsl"))
+
+
+def _report_drift(missing, stale) -> None:
+    """Print the --check report for missing/stale committed headers."""
+    if missing:
+        print("Missing embedded WGSL headers (run scripts/gen_wgsl_headers.py):")
+        for h in missing:
+            print(f"  {h.relative_to(BACKEND_ROOT)}")
+    if stale:
+        print("Stale embedded WGSL headers (run scripts/gen_wgsl_headers.py):")
+        for h in stale:
+            print(f"  {h.relative_to(BACKEND_ROOT)}")
+
+
+def main(argv=None) -> int:
+    """Write, or with --check verify, each <stem>_wgsl.h; return the exit code."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="verify committed headers match (exit 1 on drift)",
+    )
+    args = parser.parse_args(argv)
+
+    stale, missing, errors = [], [], []
+    for wgsl in discover():
+        # UTF-8 explicitly: the sha is over UTF-8 bytes, not the locale's encoding.
+        wgsl_text = wgsl.read_text(encoding="utf-8")
+        try:
+            want = render_header(wgsl, wgsl_text)
+        except ValueError as e:
+            errors.append(f"{wgsl.relative_to(BACKEND_ROOT)}: {e}")
+            continue
+        header = wgsl.with_name(wgsl.stem + "_wgsl.h")
+        # Full-content compare (not just the sha) catches generator-logic drift too.
+        if header.exists() and header.read_text(encoding="utf-8") == want:
+            continue
+        if args.check:
+            (missing if not header.exists() else stale).append(header)
+        else:
+            header.write_text(want, encoding="utf-8")
+
+    if errors:
+        print("Cannot generate header (malformed shader):")
+        for e in errors:
+            print(f"  {e}")
+        return 1
+    if args.check and (stale or missing):
+        _report_drift(missing, stale)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh
@@ -15,6 +15,13 @@ EXECUTORCH_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
 PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
 NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu)
 
+echo "=== Check embedded WGSL headers are up to date ==="
+"${PYTHON_EXECUTABLE}" "${SCRIPT_DIR}/../scripts/gen_wgsl_headers.py" --check \
+  || { echo "ERROR: *_wgsl.h out of sync with .wgsl; run scripts/gen_wgsl_headers.py"; exit 1; }
+
+# Unit tests for the WGSL header generator itself (interpreter quoted as above)
+"${PYTHON_EXECUTABLE}" -m pytest "${SCRIPT_DIR}/test_wgsl_codegen.py" -v
+
 # ── Step 1: Python export tests ──────────────────────────────────────────────
 
 echo "=== Step 1: Run Python export tests ==="
diff --git a/backends/webgpu/test/test_wgsl_codegen.py b/backends/webgpu/test/test_wgsl_codegen.py