From dc156bdb6c9c6f23ef0e3cb6b6038d88bd6055af Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Wed, 17 Jun 2026 16:19:39 -0700 Subject: [PATCH] Update [ghstack-poisoned] --- backends/webgpu/CMakeLists.txt | 1 - .../webgpu/scripts/test_webgpu_native_ci.sh | 52 ++---- backends/webgpu/test/native/test_rms_norm.cpp | 173 ------------------ backends/webgpu/test/test_build_webgpu.sh | 29 +-- backends/webgpu/test/test_webgpu_native.cpp | 131 ------------- 5 files changed, 19 insertions(+), 367 deletions(-) delete mode 100644 backends/webgpu/test/native/test_rms_norm.cpp diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 2a3a8fc1cad..14064826814 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -138,7 +138,6 @@ endfunction() if(EXECUTORCH_BUILD_WEBGPU_TEST) add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp) - add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp) add_webgpu_native_test( webgpu_dispatch_order_test test/native/test_dispatch_order.cpp ) diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh index af000cc137f..ba6a48c62be 100644 --- a/backends/webgpu/scripts/test_webgpu_native_ci.sh +++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh @@ -8,8 +8,9 @@ # Build + run the WebGPU native test executables on Dawn (Tint) + SwiftShader. # This is the substantive op-coverage gate: unlike the python operators suite # (which only delegates add.Tensor to WebGPU, the rest CPU-fallback), these -# executables run rms_norm / multi-dispatch ordering / scratch through the real -# WebGPU backend on Dawn. +# executables run quantized_linear / SDPA / update_cache / multi-dispatch +# ordering / scratch through the real WebGPU backend on Dawn. (Simple ops — +# add / rms_norm / the misc ops — run through the cases.py op-test framework.) # # Assumes the Dawn env is already sourced (Dawn_DIR + VK_ICD_FILENAMES + # LD_LIBRARY_PATH) via .ci/scripts/setup-webgpu-linux-deps.sh. For local runs: @@ -17,9 +18,9 @@ # bash backends/webgpu/scripts/test_webgpu_native_ci.sh # # Builds whatever native test targets are present in the landed tree (NOT a fixed -# list). This stack lands: webgpu_native_test, webgpu_rms_norm_test (base) + -# webgpu_dispatch_order_test, webgpu_scratch_buffer_test (D107576199) + -# webgpu_update_cache_test (D107547307). SDPA executables join once they land. +# list): webgpu_native_test (base) + webgpu_dispatch_order_test, +# webgpu_scratch_buffer_test (D107576199) + webgpu_update_cache_test +# (D107547307). SDPA executables join once they land. set -e @@ -37,33 +38,19 @@ fi cd "${EXECUTORCH_ROOT}" # ── Exports for the model-driven executables (best-effort) ─────────────────── -# native_test + rms_norm + dispatch_order read .pte/golden inputs via env/dir and -# self-skip if absent; scratch is standalone (generates its own inputs). -PTE_MODEL="/tmp/webgpu_add_test.pte" -PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte" -RMS_NORM_DIR="/tmp/rmsn" -RMS_NORM_OK=1 +# native_test (quantized_linear/SDPA/update_cache) + dispatch_order read .pte/ +# golden inputs via env/dir and self-skip if absent; scratch is standalone. +# native_test itself is gated below on the executorch wheel being importable. DISPATCH_ORDER_DIR="/tmp/dispatch_order" DISPATCH_ORDER_OK=1 UPDATE_CACHE_DIR="/tmp/update_cache" UPDATE_CACHE_OK=1 -$PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model -export_add_model('${PTE_MODEL}') -export_chained_add_model('${PTE_CHAINED_MODEL}') -" || echo "WARN: add export failed; webgpu_native_test self-skips models whose .pte is absent" - $PYTHON_EXECUTABLE -c " from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models export_all_quantized_linear_models('/tmp') " || echo "WARN: q4gsw export failed; required configs will FAIL in webgpu_native_test" -$PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases -export_rms_norm_cases('${RMS_NORM_DIR}') -" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; } - $PYTHON_EXECUTABLE -c " from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases export_dispatch_order_cases('${DISPATCH_ORDER_DIR}') @@ -112,7 +99,7 @@ cmake \ "${EXECUTORCH_ROOT}" # ── Build + run every native test target that exists in this tree ──────────── -TARGETS=(webgpu_native_test webgpu_rms_norm_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test) +TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test) BIN_DIR="${BUILD_DIR}/backends/webgpu" # Which targets are defined depends on which diffs are landed (native_test + @@ -141,20 +128,17 @@ for t in "${TARGETS[@]}"; do done echo "=== Run native tests on Dawn + SwiftShader ===" -# native_test is model-driven; only run it if the export produced its .pte -# (CI's setup-linux.sh provides the executorch wheel so exports succeed; a bare -# local run without the wheel self-skips here rather than hard-failing on load). -if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then - env WEBGPU_TEST_MODEL="${PTE_MODEL}" \ - WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \ - WEBGPU_TEST_SDPA_DIR=/tmp/ \ +# webgpu_native_test hosts the quantized_linear / SDPA / update_cache / symint +# sweeps. Gate on the executorch wheel being importable (the proxy for "the +# exports above ran"): CI has the wheel so they ran; a bare local run without it +# skips here rather than hard-failing the required-config guards. +if [[ -x "${BIN_DIR}/webgpu_native_test" ]] && + "${PYTHON_EXECUTABLE}" -c "import executorch" 2>/dev/null; then + env WEBGPU_TEST_SDPA_DIR=/tmp/ \ WEBGPU_TEST_QUANTIZED_LINEAR_DIR=/tmp/ \ "${BIN_DIR}/webgpu_native_test" else - echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)" -fi -if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then - "${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}" + echo "(skipping webgpu_native_test: executorch wheel absent — exports did not run)" fi if [[ "${UPDATE_CACHE_OK}" == "1" && -x "${BIN_DIR}/webgpu_update_cache_test" ]]; then "${BIN_DIR}/webgpu_update_cache_test" "${UPDATE_CACHE_DIR}" diff --git a/backends/webgpu/test/native/test_rms_norm.cpp b/backends/webgpu/test/native/test_rms_norm.cpp deleted file mode 100644 index 7dbd5134096..00000000000 --- a/backends/webgpu/test/native/test_rms_norm.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace executorch::backends::webgpu; -using namespace executorch::extension; -using namespace executorch::runtime; - -namespace { - -struct RmsNormCase { - const char* name; - std::array sizes; -}; - -// Mirrors test_rms_norm.py _CASES; the .py writes per-case .pte/input/golden. -constexpr RmsNormCase kRmsNormCases[] = { - {"baseline", {1, 1, 7, 896}}, - {"width_eq_wg", {1, 1, 1, 64}}, - {"width_lt_wg", {1, 1, 1, 32}}, - {"width_1", {1, 1, 1, 1}}, - {"width_100", {1, 1, 1, 100}}, - {"width_130", {1, 1, 1, 130}}, - {"rank4_guard", {1, 5, 4, 128}}, - {"many_rows", {1, 1, 1024, 64}}, - {"distinct_rows", {1, 1, 5, 256}}, - {"single_row", {1, 1, 1, 896}}, - {"mixed_sign", {1, 1, 4, 128}}, - {"large_4096", {1, 1, 1, 4096}}, - {"large_8192", {1, 1, 1, 8192}}, - {"weight_zeros_neg", {1, 1, 1, 128}}, -}; - -std::vector read_f32_bin(const std::string& path) { - std::ifstream f(path, std::ios::binary | std::ios::ate); - if (!f) { - return {}; - } - // Truncate to a whole number of f32s so read() cannot overrun the vector. - const size_t bytes = - static_cast(f.tellg()) / sizeof(float) * sizeof(float); - f.seekg(0); - std::vector data(bytes / sizeof(float)); - f.read( - reinterpret_cast(data.data()), - static_cast(bytes)); - return data; -} - -bool run_case(const std::string& dir, const RmsNormCase& tc) { - printf("\n--- Test: rms_norm[%s] ---\n", tc.name); - const std::string base = dir + "/" + tc.name; - std::vector input = read_f32_bin(base + ".input.bin"); - std::vector golden = read_f32_bin(base + ".golden.bin"); - if (input.empty() || golden.empty()) { - printf("FAIL: could not read input/golden for %s\n", tc.name); - return false; - } - - Module module(base + ".pte"); - if (module.load_forward() != Error::Ok) { - printf("FAIL: could not load %s.pte\n", tc.name); - return false; - } - - std::vector sizes(tc.sizes.begin(), tc.sizes.end()); - size_t expected = 1; - for (int32_t d : tc.sizes) { - expected *= static_cast(d); - } - if (input.size() != expected) { - printf( - "FAIL: input numel %zu != expected %zu for %s\n", - input.size(), - expected, - tc.name); - return false; - } - auto x = make_tensor_ptr(sizes, std::vector(input)); - auto result = module.forward({EValue(x)}); - if (!result.ok()) { - printf("FAIL: forward failed (error %d)\n", (int)result.error()); - return false; - } - - const auto& outputs = result.get(); - if (outputs.empty() || !outputs[0].isTensor()) { - printf("FAIL: no tensor output\n"); - return false; - } - const auto& out_tensor = outputs[0].toTensor(); - if (static_cast(out_tensor.numel()) != golden.size()) { - printf( - "FAIL: output numel %zu != golden %zu\n", - (size_t)out_tensor.numel(), - golden.size()); - return false; - } - const float* out_data = out_tensor.const_data_ptr(); - - float max_abs_err = 0.0f; - float max_rel_err = 0.0f; - for (size_t i = 0; i < golden.size(); i++) { - const float abs_err = std::abs(out_data[i] - golden[i]); - max_abs_err = std::max(max_abs_err, abs_err); - const float denom = std::max(std::abs(golden[i]), 1e-6f); - max_rel_err = std::max(max_rel_err, abs_err / denom); - } - printf( - "Max abs error: %e Max rel error: %e (%zu elements)\n", - max_abs_err, - max_rel_err, - golden.size()); - if (max_abs_err > 1e-3f || max_rel_err > 1e-3f) { - printf("FAIL: rms_norm[%s] exceeds tolerance 1e-3\n", tc.name); - return false; - } - printf("PASS: rms_norm[%s]\n", tc.name); - return true; -} - -} // namespace - -int main(int argc, char** argv) { - std::string dir = "/tmp/rmsn"; - if (argc > 1) { - dir = argv[1]; - } - if (const char* env = std::getenv("WEBGPU_RMS_NORM_DIR")) { - dir = env; - } - - WebGPUContext ctx; - try { - ctx = create_webgpu_context(); - } catch (const std::exception& e) { - printf("SKIP: %s\n", e.what()); - return 0; - } - set_default_webgpu_context(&ctx); - printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str()); - - bool ok = true; - for (const auto& tc : kRmsNormCases) { - ok = run_case(dir, tc) && ok; - } - - set_default_webgpu_context(nullptr); - destroy_webgpu_context(ctx); - - if (!ok) { - return 1; - } - printf("\nAll rms_norm tests passed\n"); - return 0; -} diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh index 6681499c055..5ea465e853b 100755 --- a/backends/webgpu/test/test_build_webgpu.sh +++ b/backends/webgpu/test/test_build_webgpu.sh @@ -26,36 +26,18 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/test_wgsl_codegen.py" -v echo "=== Step 1: Run Python export tests ===" $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v -# Non-fatal: a rms_norm pytest failure skips the rms_norm native test below -# rather than aborting the whole run. -RMS_NORM_PYTEST_OK=1 -$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v \ - || RMS_NORM_PYTEST_OK=0 +$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v # ── Step 2: Export .pte model ───────────────────────────────────────────────── echo "=== Step 2: Export test models ===" -PTE_MODEL="/tmp/webgpu_add_test.pte" -PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte" -RMS_NORM_DIR="/tmp/rmsn" DISPATCH_ORDER_DIR="/tmp/dispatch_order" PTE_UPDATE_CACHE_MODEL="/tmp/webgpu_update_cache_test.pte" cd "${EXECUTORCH_ROOT}" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model -export_add_model('${PTE_MODEL}') -export_chained_add_model('${PTE_CHAINED_MODEL}') -" -$PYTHON_EXECUTABLE -c " from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases export_dispatch_order_cases('${DISPATCH_ORDER_DIR}') " -if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then - $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases -export_rms_norm_cases('${RMS_NORM_DIR}') -" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_PYTEST_OK=0; } -fi echo "=== Export update_cache model ===" UPDATE_CACHE_OK=1 @@ -113,7 +95,6 @@ cmake \ "${EXECUTORCH_ROOT}" cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC} -cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_rms_norm_test -j${NPROC} cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_dispatch_order_test -j${NPROC} cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_scratch_buffer_test -j${NPROC} @@ -125,18 +106,10 @@ else echo "(skipping update_cache native test: export did not complete)" fi env \ - WEBGPU_TEST_MODEL="${PTE_MODEL}" \ - WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \ ${UPDATE_CACHE_ENV_VAR} \ WEBGPU_TEST_SDPA_DIR=/tmp/ \ "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test" -if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then - "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_rms_norm_test" "${RMS_NORM_DIR}" -else - echo "(skipping rms_norm native test: pytest or export did not complete)" -fi - "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}" "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_scratch_buffer_test" diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index ef643d33482..19ddbcb7158 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -24,118 +24,6 @@ using namespace executorch::backends::webgpu; using namespace executorch::extension; using namespace executorch::runtime; -static bool test_single_add(const std::string& model_path) { - printf("\n--- Test: single add (1024x1024) ---\n"); - - Module module(model_path); - auto err = module.load_forward(); - if (err != Error::Ok) { - printf("FAIL: could not load forward method (error %d)\n", (int)err); - return false; - } - printf("Model loaded: %s\n", model_path.c_str()); - - constexpr int dim = 1024; - constexpr int size = dim * dim; - - std::vector a_data(size); - std::vector b_data(size); - for (int i = 0; i < size; i++) { - a_data[i] = static_cast(i) * 1.0f; - b_data[i] = static_cast(i) * 2.0f; - } - - auto a = make_tensor_ptr({dim, dim}, std::vector(a_data)); - auto b = make_tensor_ptr({dim, dim}, std::vector(b_data)); - - auto result = module.forward({EValue(a), EValue(b)}); - if (!result.ok()) { - printf("FAIL: forward failed (error %d)\n", (int)result.error()); - return false; - } - - const auto& outputs = result.get(); - if (outputs.empty() || !outputs[0].isTensor()) { - printf("FAIL: no tensor output\n"); - return false; - } - - const auto& out_tensor = outputs[0].toTensor(); - const float* out_data = out_tensor.const_data_ptr(); - - float max_error = 0.0f; - int check_count = std::min(size, 1024); - for (int i = 0; i < check_count; i++) { - float expected = a_data[i] + b_data[i]; - float error = std::abs(out_data[i] - expected); - max_error = std::max(max_error, error); - } - - printf("Max error: %e (checked %d elements)\n", max_error, check_count); - if (max_error > 1e-3f) { - printf("FAIL: max error exceeds tolerance 1e-3\n"); - return false; - } - printf("PASS: single add test\n"); - return true; -} - -static bool test_chained_add(const std::string& model_path) { - printf("\n--- Test: chained add (1024x1024, 5 ops) ---\n"); - - Module module(model_path); - auto err = module.load_forward(); - if (err != Error::Ok) { - printf("FAIL: could not load forward method (error %d)\n", (int)err); - return false; - } - printf("Model loaded: %s\n", model_path.c_str()); - - constexpr int dim = 1024; - constexpr int size = dim * dim; - - std::vector x_data(size); - std::vector y_data(size); - for (int i = 0; i < size; i++) { - x_data[i] = static_cast(i % 100) * 0.01f; - y_data[i] = static_cast(i % 50) * 0.02f; - } - - auto x = make_tensor_ptr({dim, dim}, std::vector(x_data)); - auto y = make_tensor_ptr({dim, dim}, std::vector(y_data)); - - auto result = module.forward({EValue(x), EValue(y)}); - if (!result.ok()) { - printf("FAIL: forward failed (error %d)\n", (int)result.error()); - return false; - } - - const auto& outputs = result.get(); - if (outputs.empty() || !outputs[0].isTensor()) { - printf("FAIL: no tensor output\n"); - return false; - } - - // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y - const auto& out_tensor = outputs[0].toTensor(); - const float* out_data = out_tensor.const_data_ptr(); - - float max_error = 0.0f; - for (int i = 0; i < size; i++) { - float expected = 3.0f * x_data[i] + 3.0f * y_data[i]; - float error = std::abs(out_data[i] - expected); - max_error = std::max(max_error, error); - } - - printf("Max error: %e (checked %d elements)\n", max_error, size); - if (max_error > 1e-3f) { - printf("FAIL: max error exceeds tolerance 1e-3\n"); - return false; - } - printf("PASS: chained add test\n"); - return true; -} - #ifdef WGPU_BACKEND_ENABLE_PROFILING // Capacity-overrun must throw; runs without a device or TimestampQuery. static bool test_query_pool_overrun_throws() { @@ -1440,19 +1328,6 @@ static bool test_resize_hook(const std::string& blob_path) { } int main(int argc, char** argv) { - std::string model_path = "webgpu_add_test.pte"; - if (argc > 1) { - model_path = argv[1]; - } - if (const char* env = std::getenv("WEBGPU_TEST_MODEL")) { - model_path = env; - } - - std::string chained_model_path; - if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) { - chained_model_path = env; - } - std::string update_cache_model_path; if (const char* env = std::getenv("WEBGPU_TEST_UPDATE_CACHE_MODEL")) { update_cache_model_path = env; @@ -1494,12 +1369,6 @@ int main(int argc, char** argv) { ok = test_query_pool_overrun_throws() && ok; ok = test_query_pool_roundtrip(ctx) && ok; #endif // WGPU_BACKEND_ENABLE_PROFILING - ok = test_single_add(model_path) && ok; - - if (!chained_model_path.empty()) { - ok = test_chained_add(chained_model_path) && ok; - } - if (!update_cache_model_path.empty()) { ok = test_update_cache(update_cache_model_path) && ok; }