pytorch
diff --git a/‎.github/workflows/mlx.yml‎
Lines changed: 13 additions & 14 deletions b/‎.github/workflows/mlx.yml‎
Lines changed: 13 additions & 14 deletions
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/trunk.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 4 additions & 2 deletions b/‎Makefile‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎backends/aoti/aoti_partitioner.py‎
Lines changed: 28 additions & 10 deletions b/‎backends/aoti/aoti_partitioner.py‎
Lines changed: 28 additions & 10 deletions
diff --git a/‎backends/arm/README.md‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 2 additions & 0 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/_passes/fuse_duplicate_users_pass.py‎
Lines changed: 9 additions & 5 deletions b/‎backends/arm/_passes/fuse_duplicate_users_pass.py‎
Lines changed: 9 additions & 5 deletions
diff --git a/‎backends/arm/_passes/rewrite_adaptive_avg_pool2d.py‎
Lines changed: 170 additions & 0 deletions b/‎backends/arm/_passes/rewrite_adaptive_avg_pool2d.py‎
Lines changed: 170 additions & 0 deletions
diff --git a/‎backends/arm/_passes/rewrite_avg_pool2d_pass.py‎
Lines changed: 3 additions & 1 deletion b/‎backends/arm/_passes/rewrite_avg_pool2d_pass.py‎
Lines changed: 3 additions & 1 deletion
@@ -13,6 +13,7 @@ on:
       - backends/mlx/**
       - extension/llm/export/**
       - extension/audio/**
+      - examples/models/gemma4_31b/**
       - examples/models/parakeet/**
       - examples/models/voxtral_realtime/**
       - examples/models/qwen3_5_moe/**
@@ -77,6 +78,8 @@ jobs:
           backends/mlx/test/test_passes.py \
           backends/mlx/test/test_pattern_utils.py \
           backends/mlx/test/test_partitioner.py \
+          backends/mlx/test/test_serialization_dedup.py \
+          examples/models/gemma4_31b/quant/tests/test_pack_mlx.py \
           examples/models/gemma4_31b/tests/test_mlx_pipeline.py \
           -v
         echo "::endgroup::"
@@ -89,20 +92,16 @@ jobs:
           ./cmake-out/backends/mlx/test/multi_thread_test_runner
         echo "::endgroup::"
 
-        echo "::group::Run gated_delta_rule op tests"
-        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_gated_delta_rule run -v
-        echo "::endgroup::"
-
-        echo "::group::Run tq_norm op tests"
-        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq_norm run -v
-        echo "::endgroup::"
-
-        echo "::group::Run tq4_compress op tests"
-        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq4_compress run -v
-        echo "::endgroup::"
-
-        echo "::group::Run tq_dequant op tests"
-        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq_dequant run -v
+        echo "::group::Run custom_kernel_ops op tests"
+        # Run every custom_kernel_ops/**/test/test_*.py via its OpTestCase `run`
+        # CLI. Recurses into per-format subpackages (e.g. gguf/test), so adding a
+        # new op test file requires no change here.
+        set -e
+        for t in $(find backends/mlx/custom_kernel_ops -path '*/test/test_*.py' | sort); do
+          mod="executorch.$(echo "${t%.py}" | tr '/' '.')"
+          echo "--- ${mod} ---"
+          ${CONDA_RUN} python -m "${mod}" run -v
+        done
         echo "::endgroup::"
 
   test-mlx-qwen35-moe:
 
@@ -258,6 +258,7 @@ jobs:
           - test_arm_backend: test_pytest_models_ethos_u85
           - test_arm_backend: test_run_ethos_u85
           - test_arm_backend: test_smaller_stories_llama_tosa
+          - test_arm_backend: test_model_smollm2_135M_ethos_u85
           - test_arm_backend: test_memory_allocation
           - test_arm_backend: test_ootb_tests_ethos_u
           - test_arm_backend: test_ootb_tests_tosa
 
@@ -261,7 +261,8 @@ parakeet-vulkan:
 
 dinov2-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
-	cmake --workflow --preset llm-release-cuda
+	cmake --preset llm-release-cuda -DEXECUTORCH_BUILD_EXTENSION_IMAGE=ON
+	cmake --build --preset llm-release-cuda-install
 	@echo "==> Building DINOv2 runner with CUDA..."
 	cd examples/models/dinov2 && cmake --workflow --preset dinov2-cuda
 	@echo ""
@@ -270,7 +271,8 @@ dinov2-cuda:
 
 dinov2-cuda-debug:
 	@echo "==> Building and installing ExecuTorch with CUDA (debug mode)..."
-	cmake --workflow --preset llm-debug-cuda
+	cmake --preset llm-debug-cuda -DEXECUTORCH_BUILD_EXTENSION_IMAGE=ON
+	cmake --build --preset llm-debug-cuda-install
 	@echo "==> Building DINOv2 runner with CUDA (debug mode)..."
 	cd examples/models/dinov2 && cmake --workflow --preset dinov2-cuda-debug
 	@echo ""
 
@@ -14,7 +14,11 @@
     Partitioner,
     PartitionResult,
 )
-from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
+from executorch.exir.backend.utils import (
+    get_non_lowered_nodes,
+    tag_constant_data,
+    tag_mutated_buffer,
+)
 from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
 from torch.export.exported_program import ExportedProgram
 
@@ -60,8 +64,17 @@ def is_control_flow(node: torch.fx.Node) -> bool:
                 torch.ops.higher_order.while_loop,
             ]
 
+        # Nodes already lowered by an earlier partitioner (e.g. a preceding
+        # TensorRT partition) appear as executorch_call_delegate calls and their
+        # output getitems; re-delegating them would nest a foreign delegate. Tag
+        # only the remaining non-lowered ops so this partitioner composes after
+        # others.
+        non_lowered_nodes = set(get_non_lowered_nodes(exported_program.graph))
+
         for node in exported_program.graph.nodes:
             if node.op == "call_function":
+                if node not in non_lowered_nodes:
+                    continue
                 node.meta["delegation_tag"] = tag
             # Tag get_attr nodes that are used by control flow operations
             elif node.op == "get_attr":
@@ -76,17 +89,22 @@ def is_control_flow(node: torch.fx.Node) -> bool:
         tag_constant_data(exported_program)
         tag_mutated_buffer(exported_program)
 
-        # Tag constant placeholders that have no users
-        # tag_constant_data only tags constants that have users with delegation_tag
-        # but we need to tag all constants for this partition
+        # A constant that still has users feeds only a prior delegate; tagging it
+        # would fail backend lowering's same-tag check (its user keeps the prior
+        # tag). tag_constant_data already claimed the ones this partition uses, so
+        # tag only the genuinely unused constants here.
         for node in exported_program.graph.nodes:
-            if node.op == "placeholder" and (
-                is_param(exported_program, node)
-                or is_buffer(exported_program, node)
-                or is_lifted_tensor_constant(exported_program, node)
+            if (
+                node.op == "placeholder"
+                and not node.users
+                and "delegation_tag" not in node.meta
+                and (
+                    is_param(exported_program, node)
+                    or is_buffer(exported_program, node)
+                    or is_lifted_tensor_constant(exported_program, node)
+                )
             ):
-                if "delegation_tag" not in node.meta:
-                    node.meta["delegation_tag"] = tag
+                node.meta["delegation_tag"] = tag
 
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
 
@@ -251,7 +251,7 @@ Below is an overview of some of the testing options this script provides:
 | `test_arm_backend.sh test_pytest_ops_vkml`         | Runs operator unit tests for VKML/VGF specific use-cases.    |
 | `test_arm_backend.sh test_pytest_models_vkml`      | Runs model unit tests for VKML/VGF specific use-cases.       |
 | `test_arm_backend.sh test_run_vkml`                | Runs end-to-end unit tests for VKML/VGF specific use-cases.  |
-| `test_arm_backend.sh test_model_smollm2_135M`      | Runs some models with Corstone FVP.                          |
+| `test_arm_backend.sh test_model_smollm2_135M_ethos_u85` | Runs smollm2_135M for Ethos-U85 specific use-cases.          |
 | `test_arm_backend.sh test_ootb_tests_ethos_u`      | Runs out-of-the-box tests for Ethos-U.                       |
 | `test_arm_backend.sh test_ootb_tests_tosa`         | Runs out-of-the-box tests for TOSA.                          |
 | `test_arm_backend.sh test_ootb_tests_vgf`          | Runs out-of-the-box tests for VKML/VGF.                      |
 
@@ -149,6 +149,7 @@
 from .replace_scalar_with_tensor_pass import (  # noqa
     ReplaceScalarWithTensorByProfilePass,
 )
+from .rewrite_adaptive_avg_pool2d import RewriteAdaptiveAvgPool2dPass  # noqa
 from .rewrite_avg_pool2d_pass import RewriteAvgPool2dPass  # noqa
 from .rewrite_bool_bitwise_to_logical_pass import (  # noqa
     RewriteBoolBitwiseToLogicalPass,
 
@@ -131,6 +131,7 @@
     RemovePermutesAroundElementwiseTosaOps,
     ReplaceInfAndLimitValuesPass,
     ReplaceScalarWithTensorByProfilePass,
+    RewriteAdaptiveAvgPool2dPass,
     RewriteAvgPool2dPass,
     RewriteBoolBitwiseToLogicalPass,
     RewriteBoolToFp32CastViaInt8Pass,
@@ -504,6 +505,7 @@ def _tosa_pipeline(
                 DecomposeAsStridedCopyPass(),
                 DecomposeMaxPool2dPass(),
                 SizeAdjustInputPass(),
+                RewriteAdaptiveAvgPool2dPass(),
                 RewriteAvgPool2dPass(),
                 ComputeConstantOpsAOTPass(exported_program),
                 FuseConstantArgsPass(exported_program),
 
@@ -34,6 +34,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
         graph = graph_module.graph
         modified = False
 
+        node_order = {node: index for index, node in enumerate(graph.nodes)}
         producers: Deque[Node] = deque(node for node in graph.nodes)
 
         while producers:
@@ -48,7 +49,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
             if len(user_nodes) < 2:
                 continue
 
-            candidate_groups = self._get_candidate_groups(user_nodes)
+            candidate_groups = self._get_candidate_groups(node_order, user_nodes)
 
             signature_to_user: Dict[Tuple[Hashable, ...], Node] = {}
             for group in candidate_groups:
@@ -84,7 +85,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
 
         return PassResult(graph_module, modified)
 
-    def _get_candidate_groups(self, user_nodes):
+    def _get_candidate_groups(self, node_order, user_nodes):
         users_by_target: Dict[Tuple[str, Hashable], List[Node]] = {}
         for user in user_nodes:
             if user.graph is None:
@@ -98,9 +99,12 @@ def _get_candidate_groups(self, user_nodes):
             target_signature = (user.op, target_key)
             users_by_target.setdefault(target_signature, []).append(user)
 
-        candidate_groups = [
-            group for group in users_by_target.values() if len(group) > 1
-        ]
+        candidate_groups = []
+        for group in users_by_target.values():
+            if len(group) > 1:
+                candidate_groups.append(
+                    sorted(group, key=lambda node: node_order[node])
+                )
 
         return candidate_groups
 
 
@@ -0,0 +1,170 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Set, Type
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+
+from executorch.backends.arm._passes.fuse_constant_ops_pass import (
+    ComputeConstantOpsAOTPass,
+)
+from executorch.backends.arm.constants import NHWC_INVERSE_ORDER, NHWC_ORDER
+from executorch.backends.arm.tosa.specification import (
+    get_context_shape_env,
+    get_context_spec,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+class RewriteAdaptiveAvgPool2dPass(ArmPass):
+    """Rewrite dynamic adaptive average pooling to tosa.avg_pool2d_adaptive when
+    possible.
+
+    The condition for rewriting is that symbolic input dimensions have a known
+    remainder of 0 or 1 when divided by the static output dimensions. This
+    preserves the adaptive pooling regions without materializing slice/cat
+    decomposition.
+
+    Inputs whose spatial dimensions are both static are passed through
+    unchanged, and dimensions that do not meet the remainder condition fall
+    back to the default operator handling.
+
+    """
+
+    targeted_ops = {exir_ops.edge.aten._adaptive_avg_pool2d.default}
+    _passes_required_after: Set[Type[ExportPass]] = {
+        ComputeConstantOpsAOTPass,
+    }
+
+    @staticmethod
+    def _is_symbolic_dim(dim) -> bool:
+        """Return True when ``dim`` is a ``torch.SymInt``."""
+        return isinstance(dim, torch.SymInt)
+
+    @staticmethod
+    def _supports_dynamic_tosa_adaptive() -> bool:
+        """Return True when the context TOSA spec can express the dynamic op.
+
+        Support means major version 1, minor version >= 1, and the "shape"
+        extension. Any failure to obtain the context spec counts as
+        unsupported.
+
+        """
+        try:
+            tosa_spec = get_context_spec()
+        except Exception:
+            # Best-effort probe: no usable spec means no dynamic support.
+            return False
+        return (
+            tosa_spec.version.major == 1
+            and tosa_spec.version.minor >= 1
+            and tosa_spec.support_extension("shape")
+        )
+
+    @classmethod
+    def _get_pool_params(cls, input_size, output_size: int):
+        """Compute the pooling window for one spatial dimension.
+
+        Args:
+            input_size: Input extent; may be a plain int or a ``torch.SymInt``.
+            output_size: Requested output extent; must be a positive plain int.
+
+        Returns:
+            A ``(kernel, stride)`` tuple where ``stride`` is
+            ``input_size // output_size`` and ``kernel`` is ``stride`` plus the
+            remainder (0 or 1), or ``None`` when the dimension cannot be
+            expressed this way and the caller should fall back.
+
+        """
+        if isinstance(output_size, torch.SymInt) or not isinstance(output_size, int):
+            return None
+        # A non-positive output size cannot be used as a divisor; fall back
+        # instead of raising ZeroDivisionError from the modulo below.
+        if output_size <= 0:
+            return None
+
+        remainder = input_size % output_size
+        if cls._is_symbolic_dim(remainder):
+            shape_env = get_context_shape_env()
+            try:
+                remainder_range = shape_env.bound_sympy(remainder.node.expr)
+            except Exception:
+                return None
+
+            # Only a remainder with a single possible value is usable.
+            if not remainder_range.is_singleton():
+                return None
+            remainder = int(remainder_range.upper)
+
+        if remainder not in (0, 1):
+            return None
+
+        stride = input_size // output_size
+        return stride + remainder, stride
+
+    def call_operator(self, op, args, kwargs, meta, updated=False):
+        """Rewrite a targeted op with symbolic spatial dims to the TOSA adaptive op.
+
+        Non-targeted ops, inputs that are not 4-D, inputs with fully static
+        spatial dims, and dimensions rejected by ``_get_pool_params`` are
+        forwarded to the parent implementation unchanged.
+
+        Raises:
+            RuntimeError: If a spatial dim is symbolic but the context TOSA
+                spec lacks the required version or shape extension.
+
+        """
+        if op not in self.targeted_ops:
+            return super().call_operator(op, args, kwargs, meta, updated)
+
+        x = args[0]
+        input_shape = x.data.shape
+        # The rewrite unpacks four dims and permutes with the NHWC orders;
+        # leave any other rank to the default handling instead of failing here.
+        if len(input_shape) != 4:
+            return super().call_operator(op, args, kwargs, meta, updated)
+        _, _, input_h, input_w = input_shape
+        if not (self._is_symbolic_dim(input_h) or self._is_symbolic_dim(input_w)):
+            return super().call_operator(op, args, kwargs, meta, updated)
+
+        # Dynamic adaptive lowering requires shape-aware TOSA support.
+        if not self._supports_dynamic_tosa_adaptive():
+            raise RuntimeError(
+                "Dynamic adaptive_avg_pool2d rewrite requires TOSA-1.1 with the shape extension."
+            )
+
+        output_h, output_w = args[1]
+        h_params = self._get_pool_params(input_h, output_h)
+        w_params = self._get_pool_params(input_w, output_w)
+        # Fall back when either spatial dimension cannot be expressed as one TOSA adaptive pool.
+        if h_params is None or w_params is None:
+            return super().call_operator(op, args, kwargs, meta, updated)
+
+        kernel = [h_params[0], w_params[0]]
+        stride = [h_params[1], w_params[1]]
+        pad = [0, 0, 0, 0]
+        pad = super().call_shape_operator(
+            exir_ops.backend.tosa.CONST_SHAPE.default,
+            (pad,),
+            {},
+            meta,
+        )
+        # Only fully static kernel/stride lists are wrapped in CONST_SHAPE;
+        # lists holding a symbolic entry are passed through as-is.
+        if all(isinstance(k, int) for k in kernel):
+            kernel = super().call_shape_operator(
+                exir_ops.backend.tosa.CONST_SHAPE.default,
+                (kernel,),
+                {},
+                meta,
+            )
+        if all(isinstance(s, int) for s in stride):
+            stride = super().call_shape_operator(
+                exir_ops.backend.tosa.CONST_SHAPE.default,
+                (stride,),
+                {},
+                meta,
+            )
+
+        # Zero-points come from the node's qparams when present, else 0.
+        in_qparams = meta.data.get("input_qparams", {})
+        in_zp_val = in_qparams[0].get_zp_per_tensor() if 0 in in_qparams else 0
+        input_zp = self.call_scalar(in_zp_val, meta)
+
+        out_qparams = meta.data.get("output_qparams", {})
+        out_zp_val = out_qparams[0].get_zp_per_tensor() if 0 in out_qparams else 0
+        output_zp = self.call_scalar(out_zp_val, meta)
+
+        # NOTE(review): RewriteAvgPool2dPass selects float16 accumulation for
+        # float8 inputs; confirm whether the adaptive variant should as well.
+        acc_type = (
+            torch.int32 if x.data.dtype in (torch.int8, torch.int16) else torch.float32
+        )
+        # Permute with NHWC_ORDER before the TOSA op and with
+        # NHWC_INVERSE_ORDER afterwards.
+        pre_permute = super().call_operator(
+            exir_ops.edge.aten.permute_copy.default,
+            (x, list(NHWC_ORDER)),
+            {},
+            meta,
+            True,
+        )
+        tosa_args = (
+            pre_permute,
+            input_zp,
+            output_zp,
+            kernel,
+            stride,
+            pad,
+            acc_type,
+        )
+
+        tosa_avg_pool = super().call_operator(
+            exir_ops.backend.tosa.AVG_POOL2D_ADAPTIVE.default,
+            tosa_args,
+            {},
+            meta,
+            True,
+        )
+        return super().call_operator(
+            exir_ops.edge.aten.permute_copy.default,
+            (tosa_avg_pool, list(NHWC_INVERSE_ORDER)),
+            {},
+            meta,
+            True,
+        )
@@ -65,9 +65,11 @@ def call_operator(self, op, args, kwargs, meta, updated=False):
         # Materialize output zero-point as a scalar tensor
         output_zp = super().call_scalar(out_zp_val, meta)
 
-        # Determine accumulator dtype for AVG_POOL2D: INT32 for integer inputs, FP32 otherwise
+        # Determine accumulator dtype for AVG_POOL2D.
         if x.data.dtype in (torch.int8, torch.int16):
             acc_type = torch.int32
+        elif x.data.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+            acc_type = torch.float16
         else:
             acc_type = torch.float32
Original file line number	Diff line number	Diff line change
`@@ -149,6 +149,7 @@`
`149`	`149`	`from .replace_scalar_with_tensor_pass import ( # noqa`
`150`	`150`	`ReplaceScalarWithTensorByProfilePass,`
`151`	`151`	`)`
	`152`	`+from .rewrite_adaptive_avg_pool2d import RewriteAdaptiveAvgPool2dPass # noqa`
`152`	`153`	`from .rewrite_avg_pool2d_pass import RewriteAvgPool2dPass # noqa`
`153`	`154`	`from .rewrite_bool_bitwise_to_logical_pass import ( # noqa`
`154`	`155`	`RewriteBoolBitwiseToLogicalPass,`