pytorch
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
Lines changed: 1 addition & 1 deletion
diff --git a/.ci/scripts/build-qnn-direct-sdk.sh b/.ci/scripts/build-qnn-direct-sdk.sh
Lines changed: 33 additions & 0 deletions
diff --git a/.ci/scripts/setup-webgpu-linux-deps.sh b/.ci/scripts/setup-webgpu-linux-deps.sh
Lines changed: 83 additions & 16 deletions
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
Lines changed: 2 additions & 3 deletions
diff --git a/.ci/scripts/test_cortex_m_e2e.sh b/.ci/scripts/test_cortex_m_e2e.sh
Lines changed: 3 additions & 2 deletions
diff --git a/.ci/scripts/wheel/pre_build_script.sh b/.ci/scripts/wheel/pre_build_script.sh
Lines changed: 14 additions & 0 deletions
diff --git a/.github/workflows/_test_cortex_m_e2e.yml b/.github/workflows/_test_cortex_m_e2e.yml
Lines changed: 8 additions & 2 deletions
diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
Lines changed: 3 additions & 0 deletions
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
Lines changed: 13 additions & 14 deletions
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
Lines changed: 19 additions & 0 deletions
@@ -102,7 +102,7 @@ esac
 TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt)
 BUILD_DOCS=1
 
-if [[ "${GCC_VERSION:-}" == "11" && -z "${SKIP_PYTORCH:-}" ]]; then
+if [[ -n "${GCC_VERSION:-}" && -z "${SKIP_PYTORCH:-}" ]]; then
   PYTORCH_BUILD_MAX_JOBS=6
 fi
 
 
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -eux  # strict mode + tracing for this CI script: direct-mode QNN build followed by a size check
+
+source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"  # resolved relative to this script, not the working directory
+
+setup_android_ndk  # not defined in this file; presumably provided by the sourced install_qnn_sdk.sh -- confirm
+install_qnn  # same origin assumption as above
+install_hexagon_sdk  # same origin assumption as above
+
+bash backends/qualcomm/scripts/build.sh \
+    --build_direct_mode 3 --soc_model SM8750 \
+    --skip_x86_64 --skip_linux_android \
+    --release  # build.sh is addressed relative to the current working directory
+
+ARTIFACT="build-direct/backends/qualcomm/libqnn_executorch_backend.so"  # expected build output, also relative to the working directory
+if [ ! -f "${ARTIFACT}" ]; then  # fail when the build did not leave the library at the expected path
+    echo "ERROR: direct-mode build did not produce ${ARTIFACT}" >&2
+    exit 1
+fi
+
+MAX_SIZE_BYTES=$((200 * 1024))  # size budget: 200 KiB expressed in bytes
+ARTIFACT_SIZE=$(stat -c%s "${ARTIFACT}")  # file size in bytes; `-c%s` is GNU stat syntax
+if [ "${ARTIFACT_SIZE}" -gt "${MAX_SIZE_BYTES}" ]; then  # a size exactly at the budget still passes
+    echo "ERROR: ${ARTIFACT} is ${ARTIFACT_SIZE} bytes, exceeds ${MAX_SIZE_BYTES}-byte (200 KiB) limit" >&2
+    exit 1
+fi
+echo "PASSED: direct-mode build produced ${ARTIFACT} (${ARTIFACT_SIZE} bytes, under ${MAX_SIZE_BYTES}-byte limit)"
@@ -5,26 +5,93 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# Vendor Dawn (Tint) + SwiftShader for the WebGPU backend CI WITHOUT hosting a
+# private prebuilt:
+#   * Dawn  : Google's official nightly prebuilt, downloaded directly from
+#             github.com/google/dawn/releases (pinned tag+rev+sha256) -- the same
+#             "fetch a pinned upstream prebuilt" pattern used for other CI deps.
+#   * SwiftShader : built from source at a pinned rev compatible with the Dawn
+#             above (the ossci prebuilt is from 2020, too old for current Dawn). No S3.
+# Dawn (Chrome's WebGPU impl; its WGSL compiler Tint is the spec reference) on
+# SwiftShader gives a headless, deterministic, spec-faithful CLI backend.
+#
+# Exports Dawn_DIR / VK_ICD_FILENAMES / LD_LIBRARY_PATH for the cmake build+run.
+# Local/rig override: set DAWN_PREBUILT_DIR=<dir containing lib64/cmake/Dawn> to
+# skip the Dawn download.
 set -ex
 
-# SwiftShader: software Vulkan adapter for GPU-less CI (LunarG SDK not needed).
-install_swiftshader() {
-  _https_amazon_aws=https://ossci-android.s3.amazonaws.com
-  _swiftshader_archive=swiftshader-abe07b943-prebuilt.tar.gz
-  _swiftshader_dir=/tmp/swiftshader
-  mkdir -p $_swiftshader_dir
+# --- pinned versions (bump rev+sha together when upgrading Dawn) --------------
+DAWN_TAG="${DAWN_TAG:-v20260423.175430}"
+DAWN_REV="${DAWN_REV:-31e25af254ab572c77054edec4946d2244e184dd}"
+DAWN_SHA256="${DAWN_SHA256:-ac76fac090162dc1ecea5ed0f28a557bb8f49efc47faab01886105ace82b7b64}"
+# SwiftShader rev verified compatible with DAWN_REV (the old ossci prebuilt is
+# from 2020 and is incompatible with current Dawn -> no adapter / zero compute).
+SWIFTSHADER_REV="${SWIFTSHADER_REV:-9898204d91d6a60b6a08ad74fe4ac52a6913111b}"
 
-  _tmp_archive="/tmp/${_swiftshader_archive}"
+_dawn_dir="${DAWN_PREBUILT_DIR:-/tmp/dawn-ci}"
+_ss_dir=/tmp/swiftshader
 
-  curl --silent --show-error --location --fail --retry 3 --retry-all-errors \
-    --output "${_tmp_archive}" "$_https_amazon_aws/${_swiftshader_archive}"
+# --- toolchain prereqs --------------------------------------------------------
+# Dawn dlopens the system Vulkan loader (libvulkan1) at runtime. In addition,
+# the ubuntu-latest prebuilt is built with a very recent GCC: it references
+# libstdc++ symbols newer than ubuntu-22.04's default (e.g. _M_replace_cold,
+# GCC 13+), so the static .a won't link against the stock runtime. Pull a newer
+# libstdc++ from the ubuntu-toolchain-r PPA when GLIBCXX_3.4.32 is missing. All
+# of this is scoped to the WebGPU CI job; newer libstdc++ is backward-compatible.
+if command -v apt-get >/dev/null 2>&1; then
+  _SUDO=""; command -v sudo >/dev/null 2>&1 && _SUDO="sudo"
+  ${_SUDO} apt-get update -y || true
+  ${_SUDO} apt-get install -y libvulkan1 software-properties-common || true
+  if ! strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 2>/dev/null \
+      | grep -q "GLIBCXX_3.4.32"; then
+    ${_SUDO} add-apt-repository -y ppa:ubuntu-toolchain-r/test || true
+    ${_SUDO} apt-get update -y || true
+    ${_SUDO} apt-get install -y libstdc++6 || true  # newest GCC runtime
+  fi
+fi
 
-  tar -C "${_swiftshader_dir}" -xzf "${_tmp_archive}"
+# The native binaries / pybind lib run INSIDE the CI conda env, whose libstdc++
+# predates GLIBCXX_3.4.32 (the Dawn prebuilt's floor) -- the same limitation
+# ssjia hit for the Vulkan op tests. Upgrade the conda runtime libstdc++ so the
+# loaded libstdc++.so.6 (conda's, not the system one) satisfies Dawn at run time.
+if command -v conda >/dev/null 2>&1; then
+  conda install -y -c conda-forge "libstdcxx-ng>=14" || true
+fi
+
+# --- Dawn: official prebuilt from GitHub (no S3) ------------------------------
+mkdir -p "${_dawn_dir}"
+if [[ ! -d "${_dawn_dir}/lib64/cmake/Dawn" ]]; then
+  _dawn_tar="/tmp/Dawn-${DAWN_REV}-ubuntu-latest-Release.tar.gz"
+  curl --silent --show-error --location --fail --retry 3 --retry-all-errors \
+    --output "${_dawn_tar}" \
+    "https://github.com/google/dawn/releases/download/${DAWN_TAG}/Dawn-${DAWN_REV}-ubuntu-latest-Release.tar.gz"
+  echo "${DAWN_SHA256}  ${_dawn_tar}" | sha256sum -c -
+  # archive top dir is Dawn-<rev>-ubuntu-latest-Release/{lib64,include,bin}
+  tar -C "${_dawn_dir}" --strip-components=1 -xzf "${_dawn_tar}"
+fi
 
-  export VK_ICD_FILENAMES="${_swiftshader_dir}/swiftshader/build/Linux/vk_swiftshader_icd.json"
-  export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/:${LD_LIBRARY_PATH}"
-  export ETVK_USING_SWIFTSHADER=1
-}
+# --- SwiftShader: build from source at a pinned rev (no S3) -------------------
+# The old ossci prebuilt (swiftshader-abe07b943, 2020) is incompatible with the
+# current Dawn; build a matching modern SwiftShader instead. Self-contained
+# cmake build (vendored LLVM); the ICD lands under build/<OS>/.
+if [[ ! -d "${_ss_dir}/build" ]]; then
+  if [[ ! -d "${_ss_dir}/.git" ]]; then
+    git clone https://github.com/google/swiftshader "${_ss_dir}"
+  fi
+  git -C "${_ss_dir}" checkout "${SWIFTSHADER_REV}"
+  # vk_swiftshader's deps are vendored in-tree; tolerate unreachable
+  # disabled-feature submodules (angle, test-only) failing to fetch.
+  git -C "${_ss_dir}" submodule update --init --recursive || true
+  cmake -S "${_ss_dir}" -B "${_ss_dir}/build" -DCMAKE_BUILD_TYPE=Release \
+    -DSWIFTSHADER_BUILD_TESTS=OFF -DSWIFTSHADER_BUILD_PVR=OFF \
+    -DSWIFTSHADER_BUILD_BENCHMARKS=OFF
+  cmake --build "${_ss_dir}/build" --parallel "$(nproc)" --target vk_swiftshader
+fi
+_ss_icd="$(find "${_ss_dir}/build" -name vk_swiftshader_icd.json 2>/dev/null | head -1)"
+[[ -n "${_ss_icd}" ]] || { echo "ERROR: SwiftShader ICD not found after build" >&2; exit 1; }
 
-install_swiftshader
-bash backends/webgpu/scripts/setup-wgpu-native.sh
+_ss_libdir="$(dirname "${_ss_icd}")"
+export Dawn_DIR="${_dawn_dir}/lib64/cmake/Dawn"
+export VK_ICD_FILENAMES="${_ss_icd}"
+export LD_LIBRARY_PATH="${_ss_libdir}:${LD_LIBRARY_PATH:-}"
+export WEBGPU_USING_SWIFTSHADER=1
@@ -58,11 +58,10 @@ if [[ "$FLOW" == *vulkan* ]]; then
 fi
 
 if [[ "$FLOW" == *webgpu* ]]; then
-    # Setup swiftshader (software Vulkan adapter for GPU-less runners) and wgpu-native,
-    # which are required to build and run the WebGPU delegate.
+    # Dawn (Tint) + SwiftShader, the spec-faithful headless WebGPU backend.
     source .ci/scripts/setup-webgpu-linux-deps.sh
 
-    EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON"
+    EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON -DDawn_DIR=$Dawn_DIR"
 fi
 
 if [[ "$FLOW" == *arm* ]]; then
 
@@ -14,13 +14,14 @@
 set -eu
 
 MODEL=$1
+TARGET=${2:-cortex-m55}
 script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
 et_root_dir=$(realpath "${script_dir}/../..")
 
-# Quantization is the default for the cortex-m55 target; run.sh's
+# Quantization is the default for cortex-m targets; run.sh's
 # arg parser only recognizes --no_quantize, so we omit any explicit flag.
 export ARM_FVP_INSTALL_I_AGREE_TO_THE_CONTAINED_EULA=True
 bash "${et_root_dir}/examples/arm/run.sh" \
     --model_name="${MODEL}" \
-    --target=cortex-m55 \
+    --target="${TARGET}" \
     --bundleio
@@ -2,6 +2,8 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
@@ -57,6 +59,18 @@ fi
 
 "${GITHUB_WORKSPACE}/${REPOSITORY}/install_requirements.sh" --example
 
+# Enable VGF in pybind wheel builds when the platform-specific build input is
+# available from pip.
+if [[ "$UNAME_S" == "Linux" || "$UNAME_S" == "Darwin" ]]; then
+  if python3 -m pip install -r \
+    "${GITHUB_WORKSPACE}/${REPOSITORY}/backends/arm/requirements-arm-vgf-runtime.txt"; then
+    export EXECUTORCH_PYBIND_ENABLE_VGF=ON
+    echo "EXECUTORCH_PYBIND_ENABLE_VGF=ON" >> "${GITHUB_ENV}"
+  else
+    echo "VGF build dependency unavailable on this platform; building without VGF"
+  fi
+fi
+
 # Download Qualcomm QNN SDK on Linux x86_64 so the wheel build can include the
 # QNN backend.  The SDK is large, so we download it here (outside CMake) rather
 # than during cmake configure.
 
@@ -11,6 +11,11 @@ on:
         description: 'JSON array of model names to run on the Corstone-300 FVP, e.g. ["mv2", "mv3"]'
         required: true
         type: string
+      targets:
+        description: 'JSON array of cortex-m target CPUs to build the runner for, e.g. ["cortex-m55", "cortex-m7", "cortex-m0plus"]'
+        required: false
+        type: string
+        default: '["cortex-m55"]'
       timeout:
         description: 'Per-matrix-entry timeout in minutes'
         required: false
@@ -23,9 +28,10 @@ jobs:
     strategy:
       matrix:
         model: ${{ fromJSON(inputs.models) }}
+        target: ${{ fromJSON(inputs.targets) }}
       fail-fast: false
     with:
-      job-name: ${{ matrix.model }}
+      job-name: ${{ matrix.model }}-${{ matrix.target }}
       runner: linux.2xlarge.memory
       docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
       submodules: 'recursive'
@@ -44,4 +50,4 @@ jobs:
         source examples/arm/arm-scratch/setup_path.sh
 
         # Export and run model on FVP (run.sh internally builds the test runner).
-        bash .ci/scripts/test_cortex_m_e2e.sh ${{ matrix.model }}
+        bash .ci/scripts/test_cortex_m_e2e.sh "${{ matrix.model }}" "${{ matrix.target }}"
@@ -16,6 +16,7 @@ on:
       - .github/workflows/cuda-windows.yml
       - backends/cuda/**
       - backends/aoti/**
+      - extension/cuda/**
   workflow_dispatch:
 
 concurrency:
@@ -49,6 +50,7 @@ jobs:
       (
         contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
         contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+        contains(needs.changed-files.outputs.changed-files, 'extension/cuda') ||
         contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
@@ -150,6 +152,7 @@ jobs:
       (
         contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
         contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+        contains(needs.changed-files.outputs.changed-files, 'extension/cuda') ||
         contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
 
@@ -13,6 +13,7 @@ on:
       - backends/mlx/**
       - extension/llm/export/**
       - extension/audio/**
+      - examples/models/gemma4_31b/**
       - examples/models/parakeet/**
       - examples/models/voxtral_realtime/**
       - examples/models/qwen3_5_moe/**
@@ -77,6 +78,8 @@ jobs:
           backends/mlx/test/test_passes.py \
           backends/mlx/test/test_pattern_utils.py \
           backends/mlx/test/test_partitioner.py \
+          backends/mlx/test/test_serialization_dedup.py \
+          examples/models/gemma4_31b/quant/tests/test_pack_mlx.py \
           examples/models/gemma4_31b/tests/test_mlx_pipeline.py \
           -v
         echo "::endgroup::"
@@ -89,20 +92,16 @@ jobs:
           ./cmake-out/backends/mlx/test/multi_thread_test_runner
         echo "::endgroup::"
 
-        echo "::group::Run gated_delta_rule op tests"
-        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_gated_delta_rule run -v
-        echo "::endgroup::"
-
-        echo "::group::Run tq_norm op tests"
-        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq_norm run -v
-        echo "::endgroup::"
-
-        echo "::group::Run tq4_compress op tests"
-        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq4_compress run -v
-        echo "::endgroup::"
-
-        echo "::group::Run tq_dequant op tests"
-        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq_dequant run -v
+        echo "::group::Run custom_kernel_ops op tests"
+        # Run every custom_kernel_ops/**/test/test_*.py via its OpTestCase `run`
+        # CLI. Recurses into per-format subpackages (e.g. gguf/test), so adding a
+        # new op test file requires no change here.
+        set -e
+        for t in $(find backends/mlx/custom_kernel_ops -path '*/test/test_*.py' | sort); do
+          mod="executorch.$(echo "${t%.py}" | tr '/' '.')"
+          echo "--- ${mod} ---"
+          ${CONDA_RUN} python -m "${mod}" run -v
+        done
         echo "::endgroup::"
 
   test-mlx-qwen35-moe:
 
@@ -948,6 +948,25 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"
 
+  test-qnn-direct-build-linux:
+    name: test-qnn-direct-build-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 30
+      script: |
+        # The generic Linux job uses the base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-direct-sdk.sh
+
   test-qnn-testsuite-linux:
     name: test-qnn-testsuite-linux
     permissions:
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ on:`
`16`	`16`	`- .github/workflows/cuda-windows.yml`
`17`	`17`	`- backends/cuda/**`
`18`	`18`	`- backends/aoti/**`
	`19`	`+ - extension/cuda/**`
`19`	`20`	`workflow_dispatch:`
`20`	`21`
`21`	`22`	`concurrency:`
`@@ -49,6 +50,7 @@ jobs:`
`49`	`50`	`(`
`50`	`51`	`contains(needs.changed-files.outputs.changed-files, 'backends/cuda') \|\|`
`51`	`52`	`contains(needs.changed-files.outputs.changed-files, 'backends/aoti') \|\|`
	`53`	`+ contains(needs.changed-files.outputs.changed-files, 'extension/cuda') \|\|`
`52`	`54`	`contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') \|\|`
`53`	`55`	`needs.run-decision.outputs.is-full-run == 'true'`
`54`	`56`	`)`
`@@ -150,6 +152,7 @@ jobs:`
`150`	`152`	`(`
`151`	`153`	`contains(needs.changed-files.outputs.changed-files, 'backends/cuda') \|\|`
`152`	`154`	`contains(needs.changed-files.outputs.changed-files, 'backends/aoti') \|\|`
	`155`	`+ contains(needs.changed-files.outputs.changed-files, 'extension/cuda') \|\|`
`153`	`156`	`contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') \|\|`
`154`	`157`	`needs.run-decision.outputs.is-full-run == 'true'`
`155`	`158`	`)`