From dc156bdb6c9c6f23ef0e3cb6b6038d88bd6055af Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Wed, 17 Jun 2026 16:19:39 -0700
Subject: [PATCH] [WebGPU] Remove add/rms_norm native tests (moved to cases.py)

[ghstack-poisoned]
---
 backends/webgpu/CMakeLists.txt                |   1 -
 .../webgpu/scripts/test_webgpu_native_ci.sh   |  52 ++----
 backends/webgpu/test/native/test_rms_norm.cpp | 173 ------------------
 backends/webgpu/test/test_build_webgpu.sh     |  29 +--
 backends/webgpu/test/test_webgpu_native.cpp   | 131 -------------
 5 files changed, 19 insertions(+), 367 deletions(-)
 delete mode 100644 backends/webgpu/test/native/test_rms_norm.cpp

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 2a3a8fc1cad..14064826814 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -138,7 +138,6 @@ endfunction()
 
 if(EXECUTORCH_BUILD_WEBGPU_TEST)
   add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp)
-  add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp)
   add_webgpu_native_test(
     webgpu_dispatch_order_test test/native/test_dispatch_order.cpp
   )
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
index af000cc137f..ba6a48c62be 100644
--- a/backends/webgpu/scripts/test_webgpu_native_ci.sh
+++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -8,8 +8,9 @@
 # Build + run the WebGPU native test executables on Dawn (Tint) + SwiftShader.
 # This is the substantive op-coverage gate: unlike the python operators suite
 # (which only delegates add.Tensor to WebGPU, the rest CPU-fallback), these
-# executables run rms_norm / multi-dispatch ordering / scratch through the real
-# WebGPU backend on Dawn.
+# executables run quantized_linear / SDPA / update_cache / multi-dispatch
+# ordering / scratch through the real WebGPU backend on Dawn. (Simple ops —
+# add / rms_norm / the misc ops — run through the cases.py op-test framework.)
 #
 # Assumes the Dawn env is already sourced (Dawn_DIR + VK_ICD_FILENAMES +
 # LD_LIBRARY_PATH) via .ci/scripts/setup-webgpu-linux-deps.sh. For local runs:
@@ -17,9 +18,9 @@
 #   bash backends/webgpu/scripts/test_webgpu_native_ci.sh
 #
 # Builds whatever native test targets are present in the landed tree (NOT a fixed
-# list). This stack lands: webgpu_native_test, webgpu_rms_norm_test (base) +
-# webgpu_dispatch_order_test, webgpu_scratch_buffer_test (D107576199) +
-# webgpu_update_cache_test (D107547307). SDPA executables join once they land.
+# list): webgpu_native_test (base) + webgpu_dispatch_order_test,
+# webgpu_scratch_buffer_test (D107576199) + webgpu_update_cache_test
+# (D107547307). SDPA executables join once they land.
 
 set -e
 
@@ -37,33 +38,19 @@ fi
 cd "${EXECUTORCH_ROOT}"
 
 # ── Exports for the model-driven executables (best-effort) ───────────────────
-# native_test + rms_norm + dispatch_order read .pte/golden inputs via env/dir and
-# self-skip if absent; scratch is standalone (generates its own inputs).
-PTE_MODEL="/tmp/webgpu_add_test.pte"
-PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
-RMS_NORM_DIR="/tmp/rmsn"
-RMS_NORM_OK=1
+# native_test (quantized_linear/SDPA/update_cache) + dispatch_order read their
+# .pte/golden inputs via env/dir and self-skip if absent; scratch is standalone.
+# native_test itself is gated below on the executorch wheel being importable.
 DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 DISPATCH_ORDER_OK=1
 UPDATE_CACHE_DIR="/tmp/update_cache"
 UPDATE_CACHE_OK=1
 
-$PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
-export_add_model('${PTE_MODEL}')
-export_chained_add_model('${PTE_CHAINED_MODEL}')
-" || echo "WARN: add export failed; webgpu_native_test self-skips models whose .pte is absent"
-
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models
 export_all_quantized_linear_models('/tmp')
 " || echo "WARN: q4gsw export failed; required configs will FAIL in webgpu_native_test"
 
-$PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
-export_rms_norm_cases('${RMS_NORM_DIR}')
-" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; }
-
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
 export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
@@ -112,7 +99,7 @@ cmake \
     "${EXECUTORCH_ROOT}"
 
 # ── Build + run every native test target that exists in this tree ────────────
-TARGETS=(webgpu_native_test webgpu_rms_norm_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
+TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
 BIN_DIR="${BUILD_DIR}/backends/webgpu"
 
 # Which targets are defined depends on which diffs are landed (native_test +
@@ -141,20 +128,17 @@ for t in "${TARGETS[@]}"; do
 done
 
 echo "=== Run native tests on Dawn + SwiftShader ==="
-# native_test is model-driven; only run it if the export produced its .pte
-# (CI's setup-linux.sh provides the executorch wheel so exports succeed; a bare
-# local run without the wheel self-skips here rather than hard-failing on load).
-if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then
-  env WEBGPU_TEST_MODEL="${PTE_MODEL}" \
-      WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
-      WEBGPU_TEST_SDPA_DIR=/tmp/ \
+# webgpu_native_test hosts the quantized_linear / SDPA / update_cache / symint
+# sweeps. Gate on the executorch wheel being importable, as a proxy for "the
+# exports above ran": CI has the wheel, so they ran; a bare local run without
+# it skips here rather than hard-failing on the required-config guards.
+if [[ -x "${BIN_DIR}/webgpu_native_test" ]] &&
+  "${PYTHON_EXECUTABLE}" -c "import executorch" 2>/dev/null; then
+  env WEBGPU_TEST_SDPA_DIR=/tmp/ \
       WEBGPU_TEST_QUANTIZED_LINEAR_DIR=/tmp/ \
       "${BIN_DIR}/webgpu_native_test"
 else
-  echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)"
-fi
-if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then
-  "${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
+  echo "(skipping webgpu_native_test: executorch wheel absent — exports did not run)"
 fi
 if [[ "${UPDATE_CACHE_OK}" == "1" && -x "${BIN_DIR}/webgpu_update_cache_test" ]]; then
   "${BIN_DIR}/webgpu_update_cache_test" "${UPDATE_CACHE_DIR}"
diff --git a/backends/webgpu/test/native/test_rms_norm.cpp b/backends/webgpu/test/native/test_rms_norm.cpp
deleted file mode 100644
index 7dbd5134096..00000000000
--- a/backends/webgpu/test/native/test_rms_norm.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
-#include <executorch/extension/module/module.h>
-#include <executorch/extension/tensor/tensor.h>
-
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <fstream>
-#include <string>
-#include <vector>
-
-using namespace executorch::backends::webgpu;
-using namespace executorch::extension;
-using namespace executorch::runtime;
-
-namespace {
-
-struct RmsNormCase {
-  const char* name;
-  std::array<int32_t, 4> sizes;
-};
-
-// Mirrors test_rms_norm.py _CASES; the .py writes per-case .pte/input/golden.
-constexpr RmsNormCase kRmsNormCases[] = {
-    {"baseline", {1, 1, 7, 896}},
-    {"width_eq_wg", {1, 1, 1, 64}},
-    {"width_lt_wg", {1, 1, 1, 32}},
-    {"width_1", {1, 1, 1, 1}},
-    {"width_100", {1, 1, 1, 100}},
-    {"width_130", {1, 1, 1, 130}},
-    {"rank4_guard", {1, 5, 4, 128}},
-    {"many_rows", {1, 1, 1024, 64}},
-    {"distinct_rows", {1, 1, 5, 256}},
-    {"single_row", {1, 1, 1, 896}},
-    {"mixed_sign", {1, 1, 4, 128}},
-    {"large_4096", {1, 1, 1, 4096}},
-    {"large_8192", {1, 1, 1, 8192}},
-    {"weight_zeros_neg", {1, 1, 1, 128}},
-};
-
-std::vector<float> read_f32_bin(const std::string& path) {
-  std::ifstream f(path, std::ios::binary | std::ios::ate);
-  if (!f) {
-    return {};
-  }
-  // Truncate to a whole number of f32s so read() cannot overrun the vector.
-  const size_t bytes =
-      static_cast<size_t>(f.tellg()) / sizeof(float) * sizeof(float);
-  f.seekg(0);
-  std::vector<float> data(bytes / sizeof(float));
-  f.read(
-      reinterpret_cast<char*>(data.data()),
-      static_cast<std::streamsize>(bytes));
-  return data;
-}
-
-bool run_case(const std::string& dir, const RmsNormCase& tc) {
-  printf("\n--- Test: rms_norm[%s] ---\n", tc.name);
-  const std::string base = dir + "/" + tc.name;
-  std::vector<float> input = read_f32_bin(base + ".input.bin");
-  std::vector<float> golden = read_f32_bin(base + ".golden.bin");
-  if (input.empty() || golden.empty()) {
-    printf("FAIL: could not read input/golden for %s\n", tc.name);
-    return false;
-  }
-
-  Module module(base + ".pte");
-  if (module.load_forward() != Error::Ok) {
-    printf("FAIL: could not load %s.pte\n", tc.name);
-    return false;
-  }
-
-  std::vector<int32_t> sizes(tc.sizes.begin(), tc.sizes.end());
-  size_t expected = 1;
-  for (int32_t d : tc.sizes) {
-    expected *= static_cast<size_t>(d);
-  }
-  if (input.size() != expected) {
-    printf(
-        "FAIL: input numel %zu != expected %zu for %s\n",
-        input.size(),
-        expected,
-        tc.name);
-    return false;
-  }
-  auto x = make_tensor_ptr(sizes, std::vector<float>(input));
-  auto result = module.forward({EValue(x)});
-  if (!result.ok()) {
-    printf("FAIL: forward failed (error %d)\n", (int)result.error());
-    return false;
-  }
-
-  const auto& outputs = result.get();
-  if (outputs.empty() || !outputs[0].isTensor()) {
-    printf("FAIL: no tensor output\n");
-    return false;
-  }
-  const auto& out_tensor = outputs[0].toTensor();
-  if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
-    printf(
-        "FAIL: output numel %zu != golden %zu\n",
-        (size_t)out_tensor.numel(),
-        golden.size());
-    return false;
-  }
-  const float* out_data = out_tensor.const_data_ptr<float>();
-
-  float max_abs_err = 0.0f;
-  float max_rel_err = 0.0f;
-  for (size_t i = 0; i < golden.size(); i++) {
-    const float abs_err = std::abs(out_data[i] - golden[i]);
-    max_abs_err = std::max(max_abs_err, abs_err);
-    const float denom = std::max(std::abs(golden[i]), 1e-6f);
-    max_rel_err = std::max(max_rel_err, abs_err / denom);
-  }
-  printf(
-      "Max abs error: %e   Max rel error: %e (%zu elements)\n",
-      max_abs_err,
-      max_rel_err,
-      golden.size());
-  if (max_abs_err > 1e-3f || max_rel_err > 1e-3f) {
-    printf("FAIL: rms_norm[%s] exceeds tolerance 1e-3\n", tc.name);
-    return false;
-  }
-  printf("PASS: rms_norm[%s]\n", tc.name);
-  return true;
-}
-
-} // namespace
-
-int main(int argc, char** argv) {
-  std::string dir = "/tmp/rmsn";
-  if (argc > 1) {
-    dir = argv[1];
-  }
-  if (const char* env = std::getenv("WEBGPU_RMS_NORM_DIR")) {
-    dir = env;
-  }
-
-  WebGPUContext ctx;
-  try {
-    ctx = create_webgpu_context();
-  } catch (const std::exception& e) {
-    printf("SKIP: %s\n", e.what());
-    return 0;
-  }
-  set_default_webgpu_context(&ctx);
-  printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str());
-
-  bool ok = true;
-  for (const auto& tc : kRmsNormCases) {
-    ok = run_case(dir, tc) && ok;
-  }
-
-  set_default_webgpu_context(nullptr);
-  destroy_webgpu_context(ctx);
-
-  if (!ok) {
-    return 1;
-  }
-  printf("\nAll rms_norm tests passed\n");
-  return 0;
-}
diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh
index 6681499c055..5ea465e853b 100755
--- a/backends/webgpu/test/test_build_webgpu.sh
+++ b/backends/webgpu/test/test_build_webgpu.sh
@@ -26,36 +26,18 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/test_wgsl_codegen.py" -v
 
 echo "=== Step 1: Run Python export tests ==="
 $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v
-# Non-fatal: a rms_norm pytest failure skips the rms_norm native test below
-# rather than aborting the whole run.
-RMS_NORM_PYTEST_OK=1
-$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v \
-    || RMS_NORM_PYTEST_OK=0
+$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v
 
 # ── Step 2: Export .pte model ─────────────────────────────────────────────────
 
 echo "=== Step 2: Export test models ==="
-PTE_MODEL="/tmp/webgpu_add_test.pte"
-PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
-RMS_NORM_DIR="/tmp/rmsn"
 DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 PTE_UPDATE_CACHE_MODEL="/tmp/webgpu_update_cache_test.pte"
 cd "${EXECUTORCH_ROOT}"
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
-export_add_model('${PTE_MODEL}')
-export_chained_add_model('${PTE_CHAINED_MODEL}')
-"
-$PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
 export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
 "
-if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then
-  $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
-export_rms_norm_cases('${RMS_NORM_DIR}')
-" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_PYTEST_OK=0; }
-fi
 
 echo "=== Export update_cache model ==="
 UPDATE_CACHE_OK=1
@@ -113,7 +95,6 @@ cmake \
     "${EXECUTORCH_ROOT}"
 
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC}
-cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_rms_norm_test -j${NPROC}
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_dispatch_order_test -j${NPROC}
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_scratch_buffer_test -j${NPROC}
 
@@ -125,18 +106,10 @@ else
   echo "(skipping update_cache native test: export did not complete)"
 fi
 env \
-    WEBGPU_TEST_MODEL="${PTE_MODEL}" \
-    WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
     ${UPDATE_CACHE_ENV_VAR} \
     WEBGPU_TEST_SDPA_DIR=/tmp/ \
     "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test"
 
-if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then
-  "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
-else
-  echo "(skipping rms_norm native test: pytest or export did not complete)"
-fi
-
 "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
 "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_scratch_buffer_test"
 
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index ef643d33482..19ddbcb7158 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -24,118 +24,6 @@ using namespace executorch::backends::webgpu;
 using namespace executorch::extension;
 using namespace executorch::runtime;
 
-static bool test_single_add(const std::string& model_path) {
-  printf("\n--- Test: single add (1024x1024) ---\n");
-
-  Module module(model_path);
-  auto err = module.load_forward();
-  if (err != Error::Ok) {
-    printf("FAIL: could not load forward method (error %d)\n", (int)err);
-    return false;
-  }
-  printf("Model loaded: %s\n", model_path.c_str());
-
-  constexpr int dim = 1024;
-  constexpr int size = dim * dim;
-
-  std::vector<float> a_data(size);
-  std::vector<float> b_data(size);
-  for (int i = 0; i < size; i++) {
-    a_data[i] = static_cast<float>(i) * 1.0f;
-    b_data[i] = static_cast<float>(i) * 2.0f;
-  }
-
-  auto a = make_tensor_ptr({dim, dim}, std::vector<float>(a_data));
-  auto b = make_tensor_ptr({dim, dim}, std::vector<float>(b_data));
-
-  auto result = module.forward({EValue(a), EValue(b)});
-  if (!result.ok()) {
-    printf("FAIL: forward failed (error %d)\n", (int)result.error());
-    return false;
-  }
-
-  const auto& outputs = result.get();
-  if (outputs.empty() || !outputs[0].isTensor()) {
-    printf("FAIL: no tensor output\n");
-    return false;
-  }
-
-  const auto& out_tensor = outputs[0].toTensor();
-  const float* out_data = out_tensor.const_data_ptr<float>();
-
-  float max_error = 0.0f;
-  int check_count = std::min(size, 1024);
-  for (int i = 0; i < check_count; i++) {
-    float expected = a_data[i] + b_data[i];
-    float error = std::abs(out_data[i] - expected);
-    max_error = std::max(max_error, error);
-  }
-
-  printf("Max error: %e (checked %d elements)\n", max_error, check_count);
-  if (max_error > 1e-3f) {
-    printf("FAIL: max error exceeds tolerance 1e-3\n");
-    return false;
-  }
-  printf("PASS: single add test\n");
-  return true;
-}
-
-static bool test_chained_add(const std::string& model_path) {
-  printf("\n--- Test: chained add (1024x1024, 5 ops) ---\n");
-
-  Module module(model_path);
-  auto err = module.load_forward();
-  if (err != Error::Ok) {
-    printf("FAIL: could not load forward method (error %d)\n", (int)err);
-    return false;
-  }
-  printf("Model loaded: %s\n", model_path.c_str());
-
-  constexpr int dim = 1024;
-  constexpr int size = dim * dim;
-
-  std::vector<float> x_data(size);
-  std::vector<float> y_data(size);
-  for (int i = 0; i < size; i++) {
-    x_data[i] = static_cast<float>(i % 100) * 0.01f;
-    y_data[i] = static_cast<float>(i % 50) * 0.02f;
-  }
-
-  auto x = make_tensor_ptr({dim, dim}, std::vector<float>(x_data));
-  auto y = make_tensor_ptr({dim, dim}, std::vector<float>(y_data));
-
-  auto result = module.forward({EValue(x), EValue(y)});
-  if (!result.ok()) {
-    printf("FAIL: forward failed (error %d)\n", (int)result.error());
-    return false;
-  }
-
-  const auto& outputs = result.get();
-  if (outputs.empty() || !outputs[0].isTensor()) {
-    printf("FAIL: no tensor output\n");
-    return false;
-  }
-
-  // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y
-  const auto& out_tensor = outputs[0].toTensor();
-  const float* out_data = out_tensor.const_data_ptr<float>();
-
-  float max_error = 0.0f;
-  for (int i = 0; i < size; i++) {
-    float expected = 3.0f * x_data[i] + 3.0f * y_data[i];
-    float error = std::abs(out_data[i] - expected);
-    max_error = std::max(max_error, error);
-  }
-
-  printf("Max error: %e (checked %d elements)\n", max_error, size);
-  if (max_error > 1e-3f) {
-    printf("FAIL: max error exceeds tolerance 1e-3\n");
-    return false;
-  }
-  printf("PASS: chained add test\n");
-  return true;
-}
-
 #ifdef WGPU_BACKEND_ENABLE_PROFILING
 // Capacity-overrun must throw; runs without a device or TimestampQuery.
 static bool test_query_pool_overrun_throws() {
@@ -1440,19 +1328,6 @@ static bool test_resize_hook(const std::string& blob_path) {
 }
 
 int main(int argc, char** argv) {
-  std::string model_path = "webgpu_add_test.pte";
-  if (argc > 1) {
-    model_path = argv[1];
-  }
-  if (const char* env = std::getenv("WEBGPU_TEST_MODEL")) {
-    model_path = env;
-  }
-
-  std::string chained_model_path;
-  if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) {
-    chained_model_path = env;
-  }
-
   std::string update_cache_model_path;
   if (const char* env = std::getenv("WEBGPU_TEST_UPDATE_CACHE_MODEL")) {
     update_cache_model_path = env;
@@ -1494,12 +1369,6 @@ int main(int argc, char** argv) {
   ok = test_query_pool_overrun_throws() && ok;
   ok = test_query_pool_roundtrip(ctx) && ok;
 #endif // WGPU_BACKEND_ENABLE_PROFILING
-  ok = test_single_add(model_path) && ok;
-
-  if (!chained_model_path.empty()) {
-    ok = test_chained_add(chained_model_path) && ok;
-  }
-
   if (!update_cache_model_path.empty()) {
     ok = test_update_cache(update_cache_model_path) && ok;
   }