From c06b9f37d46962476b48fa4537dd338daf515795 Mon Sep 17 00:00:00 2001
From: Martin Pavella <martin.pavella@nxp.com>
Date: Tue, 16 Jun 2026 16:56:51 +0200
Subject: [PATCH 1/2] NXP backend: Enable `aten.bmm` with new Neutron flow.

---
 .../ops_converters/bmm_converter.py           |  46 +--
 backends/nxp/quantizer/patterns.py            |  21 +-
 .../node_converter/test_bmm_converter.py      | 285 ++++++------------
 backends/nxp/tests/models.py                  |  21 +-
 backends/nxp/tests/ops_aliases.py             |   2 +-
 5 files changed, 135 insertions(+), 240 deletions(-)

diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/bmm_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/bmm_converter.py
index 08b8533a89a..d31522a665d 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/bmm_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/bmm_converter.py
@@ -2,9 +2,13 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+import torch
 
 from executorch.backends.nxp.backend.data_format import NXP_NODE_FORMAT
-from executorch.backends.nxp.backend.edge_helper import input_rank
+from executorch.backends.nxp.backend.edge_helper import (
+    get_quantization_parameters_for,
+    input_rank,
+)
 from executorch.backends.nxp.backend.ir.converter.conversion import translator
 from executorch.backends.nxp.backend.ir.converter.conversion.common import OpsList
 from executorch.backends.nxp.backend.ir.converter.node_converter import (
@@ -14,9 +18,6 @@
 from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
     batch_mat_mul_options,
 )
-from executorch.backends.nxp.backend.neutron_operator_support import (
-    transposition_is_supported_on_neutron,
-)
 from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
 from torch.nn import Parameter
@@ -44,35 +45,18 @@ def _is_supported_on_target(
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        is_ch_first_1 = node.args[0].meta[NXP_NODE_FORMAT].is_channels_first()
-        is_ch_first_2 = node.args[1].meta[NXP_NODE_FORMAT].is_channels_first()
-        # This combination of node formats is not supported on Neutron (`adj_x = True`, `adj_y = False`),
-        # but it should never happen because both input tensors are expected to share the same format.
-        if is_ch_first_1 and not is_ch_first_2:
+        if not NodeConverter.uses_quantization_type_for_io(
+            node,
+            supported_types=[torch.int8, torch.uint8],
+            input_indices=[0, 1],
+            output_indices=[0],
+        ):
             return False
 
-        # In case we need to insert transpose after `BatchMatMul`, we also need to check if
-        # such transposition is supported.
-        if node.meta[NXP_NODE_FORMAT].is_channels_first():
-            tensor_shape = node.meta["val"].shape
-            tensor_rank = len(tensor_shape)
-            perm = translator.create_channels_first_to_channels_last_permutation(
-                tensor_rank, return_list=True
-            )
-
-            tensor_shape_channels_last = [tensor_shape[i] for i in perm]
-            if not transposition_is_supported_on_neutron(
-                tensor_shape_channels_last, perm, neutron_target_spec
-            ):
-                return False
-
-        _, d1, d2 = node.args[0].meta["val"].shape
-        _, d3, d4 = node.args[1].meta["val"].shape
-
-        # The Neutron converter requires that every dimension participating in the
-        # multiplication is divisible by NUM_MACS.
-        num_macs = neutron_target_spec.get_num_macs()
-        if not all(m % num_macs == 0 for m in [d1, d2, d3, d4]):
+        _, input_1_zp = get_quantization_parameters_for(node.args[0])
+        _, input_2_zp = get_quantization_parameters_for(node.args[1])
+        if not (input_1_zp == input_2_zp == 0):
+            # Neutron requires both inputs to have zero point = 0.
             return False
 
         return True
diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py
index 9e21e4f1660..841f3897d6e 100644
--- a/backends/nxp/quantizer/patterns.py
+++ b/backends/nxp/quantizer/patterns.py
@@ -10,6 +10,7 @@
 from functools import partial
 
 import torch
+
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.clamp_converter import (
     _is_convertible_to_relu,
 )
@@ -22,6 +23,8 @@
 from torch.fx import Node
 from torchao.quantization.pt2e import (
     FakeQuantize,
+    MinMaxObserver,
+    MovingAverageMinMaxObserver,
     MovingAveragePerChannelMinMaxObserver,
     PerChannelMinMaxObserver,
 )
@@ -326,10 +329,24 @@ def get_anchors(
     ) -> PartitionAnchors | None:
         bmm_node = fused_partition[0].nodes[-1]
 
+        # Use `per_tensor_symmetric` to enforce zero point = 0 for both inputs.
+        observer_or_fake_quant_ctr = (
+            FakeQuantize.with_args(observer=MovingAverageMinMaxObserver)
+            if self.is_qat
+            else MinMaxObserver
+        )
+        input_quantization_spec = QuantizationSpec(
+            dtype=torch.int8,
+            observer_or_fake_quant_ctr=observer_or_fake_quant_ctr,
+            quant_min=-128,
+            quant_max=127,
+            qscheme=torch.per_tensor_symmetric,  # Neutron requires the inputs to have zero point = 0.
+        )
+
         return PartitionAnchors(
             inputs=[
-                (bmm_node, NodeArgsIdx(0)),
-                (bmm_node, NodeArgsIdx(1)),
+                (bmm_node, NodeArgsIdx(0), input_quantization_spec),
+                (bmm_node, NodeArgsIdx(1), input_quantization_spec),
             ],
             biases=[],
             output=[(bmm_node,)],
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
index dc442a4931c..4636e6205af 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
@@ -3,29 +3,26 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import numpy as np
+# noinspection PyUnusedImports
 import pytest
 import torch
-from executorch.backends.nxp.backend.edge_program_converter import (
-    EdgeProgramToIRConverter,
-)
-from executorch.backends.nxp.backend.ir.converter.conversion import translator
-from executorch.backends.nxp.backend.neutron_operator_support import (
-    transposition_is_supported_on_neutron,
+
+from executorch.backends.nxp.edge_passes.move_auxiliary_operator_into_separate_qdq_cluster_pass import (
+    ViewCopy,
 )
-from executorch.backends.nxp.tests.executorch_pipeline import (
-    neutron_target_spec,
-    to_quantized_edge_program,
+from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator
+from executorch.backends.nxp.tests.executorch_pipeline import ModelInputSpec
+from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier
+from executorch.backends.nxp.tests.model_output_comparator import (
+    AllCloseOutputComparator,
 )
-from executorch.backends.nxp.tests.executors import (
-    convert_run_compare,
-    graph_contains_any_of_ops,
-    ToChannelFirstPreprocess,
-    ToChannelLastPreprocess,
+from executorch.backends.nxp.tests.models import (
+    BatchMatMulMaxPoolModel,
+    BatchMatMulModel,
 )
-from executorch.backends.nxp.tests.models import BatchMatMulConvModel, BatchMatMulModel
+from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
+from executorch.backends.nxp.tests.ops_aliases import BMM, GetItem, MaxPool2DWithIndices
 from executorch.backends.nxp.tests.use_qat import *  # noqa F403
-from executorch.exir.dialects._ops import ops as exir_ops
 
 
 @pytest.fixture(autouse=True)
@@ -33,186 +30,86 @@ def reseed_model_per_test_run():
     torch.manual_seed(23)
 
 
-ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate
-Bmm = exir_ops.edge.aten.bmm.default
-
-
-@pytest.mark.parametrize(
-    "input_shape_x1, input_shape_x2",
-    [
-        pytest.param((1, 24, 16), (1, 16, 24), id="3D, one batch."),
-        pytest.param((3, 8, 24), (3, 24, 8), id="3D, more batches."),
-        pytest.param((2, 24, 16), (2, 16, 8), id="3D, more batches, x1_C != x2_W"),
-    ],
-)
-def test_convert_bmm__supported(mocker, input_shape_x1, input_shape_x2, use_qat):
-    model = BatchMatMulModel()
-
-    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
-    delegated_ep = to_quantized_edge_program(
-        model, [input_shape_x1, input_shape_x2], use_qat=use_qat
-    ).exported_program()
+class TestBMM:
 
-    # Make sure the `bmm` was delegated.
-    assert graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
-    assert not graph_contains_any_of_ops(delegated_ep.graph, [Bmm])
-
-    # Verify correct behavior of the converted NeutronIR model.
-    intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
-
-    input_data_1 = (
-        np.random.random(input_shape_x1).astype(np.float32) * 256.0 - 128.0
-    ).astype(np.int8)
-    input_data_2 = (
-        np.random.random(input_shape_x2).astype(np.float32) * 256.0 - 128.0
-    ).astype(np.int8)
-
-    # Make sure the tested program contains the `bmm`.
-    assert graph_contains_any_of_ops(intermediate_ep.graph, [Bmm])
-
-    # Verify that the delegated `bmm` node produces correct results
-    # The delegated `bmm` runs with a numerical tolerance of atol = 1
-    convert_run_compare(
-        intermediate_ep,
-        tfl_model=neutron_ir_model,
-        input_data={
-            0: input_data_1,
-            1: input_data_2,
-        },
-        atol=1,
-    )
-
-
-@pytest.mark.parametrize(
-    "input_shape_x1, input_shape_x2",
-    [
+    # noinspection PyMethodMayBeStatic
+    def assert_delegated(
+        self,
+        model,
+        input_1_shape,
+        input_2_shape,
+        mocker,
+        use_qat=False,
+        expected_delegated_ops=None,
+    ):
+        if expected_delegated_ops is None:
+            expected_delegated_ops = {BMM: 1}
+
+        graph_verifier = DetailedGraphVerifier(
+            mocker,
+            expected_delegated_ops=expected_delegated_ops,
+            expected_non_delegated_ops={},
+        )
+
+        # Use quantized dataset and allow a single-bit error.
+        remove_quant_io_ops = True
+        output_comparator = AllCloseOutputComparator(atol=1)
+
+        # Also cover negative values to thoroughly test the operator.
+        dataset_creator = RandomDatasetCreator(low=-2, high=2)
+
+        lower_run_compare(
+            model,
+            [ModelInputSpec(input_1_shape), ModelInputSpec(input_2_shape)],
+            graph_verifier,
+            dataset_creator,
+            output_comparator=output_comparator,
+            use_qat=use_qat,
+            remove_quant_io_ops=remove_quant_io_ops,
+        )
+
+    def test__qat(self, mocker, use_qat):
+        input_1_shape, input_2_shape = (1, 24, 16), (1, 16, 24)
+        model = BatchMatMulModel()
+        self.assert_delegated(
+            model, input_1_shape, input_2_shape, mocker, use_qat=use_qat
+        )
+
+    # Input shape parameters used by 2 tests in this class.
+    BMM_SHAPES = [
+        pytest.param((3, 8, 24), (3, 24, 8), id="more batches"),
+        pytest.param((2, 24, 16), (2, 16, 8), id="more batches, x1_C != x2_W"),
         pytest.param(
-            (1, 8, 7), (1, 7, 16), id="3D, x1_W (and x2_C) not divisible by NUM_MACS."
+            (1, 8, 7), (1, 7, 16), id="x1_W (and x2_C) not divisible by NUM_MACS"
         ),
-        pytest.param((1, 7, 16), (1, 16, 8), id="3D, x1_C not divisible by NUM_MACS."),
-        pytest.param((1, 8, 16), (1, 16, 7), id="3D, x2_W not divisible by NUM_MACS."),
-    ],
-)
-def test_convert_bmm__unsupported_shape(input_shape_x1, input_shape_x2, use_qat):
-    model = BatchMatMulModel()
-
-    delegated_ep = to_quantized_edge_program(
-        model, [input_shape_x1, input_shape_x2], use_qat=use_qat
-    ).exported_program()
-
-    # Make sure the `bmm` was NOT delegated.
-    assert graph_contains_any_of_ops(delegated_ep.graph, [Bmm])
-
-
-def test_convert_bmm__unsupported_dim_order(mocker, use_qat):
-    pytest.xfail(
-        "`test_convert_bmm__unsupported_dim_order` is invalid due to incorrect propagation of node format "
-        "through `view_copy` nodes introduced by the aten pass that converts `conv1d` to `conv2d` "
-        "in the test model. `NodeFormatInference` needs to be updated to propagate the "
-        "`channels_first` format only when the batch or channel dimension is modified by the `view_copy` "
-        "or by other nodes."
+        pytest.param((1, 7, 16), (1, 16, 8), id="x1_C not divisible by NUM_MACS"),
+        pytest.param((1, 8, 16), (1, 16, 7), id="x2_W not divisible by NUM_MACS"),
+        pytest.param((3, 5, 7), (3, 7, 11), id="nothing divisible by NUM_MACS"),
+    ]
+
+    @pytest.mark.parametrize(
+        "input_1_shape, input_2_shape",
+        BMM_SHAPES,
     )
+    def test__nsys_inference(self, mocker, input_1_shape, input_2_shape):
+        model = BatchMatMulModel()
+        self.assert_delegated(model, input_1_shape, input_2_shape, mocker)
 
-    n1 = n2 = 5
-    w1 = c2 = 16
-    c1 = 8
-    w2 = 24
-
-    x_input_shape = (n1, c1, w1)
-    y_input_shape = (n2, c2, w2)
-
-    model = BatchMatMulConvModel(in_channels=c1, out_channels=c1)
-
-    delegated_ep = to_quantized_edge_program(
-        model,
-        [x_input_shape, y_input_shape],
-        use_neutron_for_format_conversion=False,
-        use_qat=use_qat,
-    ).exported_program()
-
-    # Make sure the `bmm` was NOT delegated.
-    # For `bmm` to work in channels-first order, support for 3D `transpose` is needed,
-    # which is not implemented in NXP Executorch backend yet.
-    assert not graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
-    assert graph_contains_any_of_ops(delegated_ep.graph, [Bmm])
-
-
-def test_convert_bmm__channels_first(mocker, use_qat):
-    pytest.xfail(
-        "`test_convert_bmm__channels_first` is invalid due to incorrect propagation of node format "
-        "through `view_copy` nodes introduced by the aten pass that converts `conv1d` to `conv2d` "
-        "in the test model. `NodeFormatInference` needs to be updated to propagate the "
-        "`channels_first` format only when the batch or channel dimension is modified by the `view_copy` "
-        "or by other nodes."
-    )
-    # These must match:
-    # - `n1 = n2`
-    # - `w1 = c2`
-    # Otherwise it violates `bmm` constraints per mathematical definition.
-    n1 = n2 = 5
-    w1 = c2 = 16
-
-    # `c1`, `w1`, `c2`, `w2` also need to be divisible by `num_macs`.
-    c1 = 8
-    w2 = 24
-
-    x_input_shape = (n1, c1, w1)
-    y_input_shape = (n2, c2, w2)
-
-    # Channels-last shape of the output before the newly-inserted `transpose`
-    # converts it to channels-first
-    output_shape = (n1, w2, c1)
-
-    perm = translator.create_channels_first_to_channels_last_permutation(
-        len(output_shape), return_list=True
-    )
-    transp_not_supported = not transposition_is_supported_on_neutron(
-        output_shape, perm, neutron_target_spec
-    )
-    if transp_not_supported:
-        pytest.skip("3D dim order swap not implemented.")
-
-    model = BatchMatMulConvModel(in_channels=c1, out_channels=c1)
-
-    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
-    delegated_ep = to_quantized_edge_program(
-        model,
-        [x_input_shape, y_input_shape],
-        use_neutron_for_format_conversion=False,
-        use_qat=use_qat,
-    ).exported_program()
-
-    # Make sure the `bmm` was delegated.
-    assert graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall])
-    assert not graph_contains_any_of_ops(delegated_ep.graph, [Bmm])
-
-    # Verify correct behavior of the converted NeutronIR model.
-    neutron_ir_model = converter_spy.spy_return[0]
-    bmm_intermediate_ep = converter_spy.call_args.args[1]
-
-    input_data_1 = (
-        np.random.random(x_input_shape).astype(np.float32) * 256.0 - 128.0
-    ).astype(np.int8)
-    input_data_2 = (
-        np.random.random(y_input_shape).astype(np.float32) * 256.0 - 128.0
-    ).astype(np.int8)
-
-    # Make sure the tested program contains the `bmm`.
-    assert graph_contains_any_of_ops(bmm_intermediate_ep.graph, [Bmm])
-
-    # Verify that the delegated `bmm` node produces correct results
-    # The delegated `bmm` runs with a numerical tolerance of atol = 1.
-    # The `intermediate_ep` has input positions swapped.
-    input_data = {
-        0: input_data_2,
-        1: input_data_1,
-    }
-    convert_run_compare(
-        bmm_intermediate_ep,
-        tfl_model=neutron_ir_model,
-        input_data=input_data,
-        atol=1,
-        tflite_input_preprocess=ToChannelLastPreprocess(),
-        tflite_output_preprocess=ToChannelFirstPreprocess(),
+    @pytest.mark.parametrize(
+        "input_1_shape, input_2_shape",
+        BMM_SHAPES,
     )
+    def test__channels_first(self, mocker, input_1_shape, input_2_shape):
+        model = BatchMatMulMaxPoolModel()
+        self.assert_delegated(
+            model,
+            input_1_shape,
+            input_2_shape,
+            mocker,
+            expected_delegated_ops={
+                BMM: 1,
+                MaxPool2DWithIndices: 1,
+                GetItem: 1,
+                ViewCopy: 2,
+            },
+        )
diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py
index 7545dd940f2..d43fc336bc5 100644
--- a/backends/nxp/tests/models.py
+++ b/backends/nxp/tests/models.py
@@ -946,17 +946,14 @@ def forward(self, x, y):
         return torch.bmm(x, y)
 
 
-class BatchMatMulConvModel(torch.nn.Module):
-    def __init__(self, in_channels, out_channels):
-        super().__init__()
-        self.conv = Conv1dModule(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            stride=1,
-            padding=1,
-            kernel_size=3,
-        )
+class BatchMatMulMaxPoolModel(torch.nn.Module):
+
+    @staticmethod
+    def noop_max_pool_1d(x):
+        """Call `torch.max_pool1d` with `kernel_size=1`, which is a no-op but makes `NodeFormatInference` enforce the channels-first format."""
+        return torch.max_pool1d(x, kernel_size=1)
 
     def forward(self, x, y):
-        x = self.conv(x)
-        return torch.bmm(x, y)
+        x = torch.bmm(x, y)
+        x = self.noop_max_pool_1d(x)
+        return x
diff --git a/backends/nxp/tests/ops_aliases.py b/backends/nxp/tests/ops_aliases.py
index aceb9707106..f16ab66c528 100644
--- a/backends/nxp/tests/ops_aliases.py
+++ b/backends/nxp/tests/ops_aliases.py
@@ -16,7 +16,7 @@
 AddMm = exir_ops.edge.aten.addmm.default
 AddTensor = exir_ops.edge.aten.add.Tensor
 AvgPool2D = exir_ops.edge.aten.avg_pool2d.default
-Bmm = exir_ops.edge.aten.bmm.default
+BMM = exir_ops.edge.aten.bmm.default
 Cat = exir_ops.edge.aten.cat.default
 Clamp = exir_ops.edge.aten.clamp.default
 Clone = exir_ops.edge.aten.clone.default

From bfe6e5e73443f9be0cbd5be582fc7d872bc5bae9 Mon Sep 17 00:00:00 2001
From: Martin Pavella <martin.pavella@nxp.com>
Date: Fri, 19 Jun 2026 09:56:14 +0200
Subject: [PATCH 2/2] Resolve conflicts.

---
 .../converter/node_converter/test_bmm_converter.py  | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
index 4636e6205af..c564c024623 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
@@ -39,6 +39,7 @@ def assert_delegated(
         input_1_shape,
         input_2_shape,
         mocker,
+        request,
         use_qat=False,
         expected_delegated_ops=None,
     ):
@@ -62,17 +63,18 @@ def assert_delegated(
             model,
             [ModelInputSpec(input_1_shape), ModelInputSpec(input_2_shape)],
             graph_verifier,
+            request,
             dataset_creator,
             output_comparator=output_comparator,
             use_qat=use_qat,
             remove_quant_io_ops=remove_quant_io_ops,
         )
 
-    def test__qat(self, mocker, use_qat):
+    def test__qat(self, mocker, request, use_qat):
         input_1_shape, input_2_shape = (1, 24, 16), (1, 16, 24)
         model = BatchMatMulModel()
         self.assert_delegated(
-            model, input_1_shape, input_2_shape, mocker, use_qat=use_qat
+            model, input_1_shape, input_2_shape, mocker, request, use_qat=use_qat
         )
 
     # Input shape parameters used by 2 tests in this class.
@@ -91,21 +93,22 @@ def test__qat(self, mocker, use_qat):
         "input_1_shape, input_2_shape",
         BMM_SHAPES,
     )
-    def test__nsys_inference(self, mocker, input_1_shape, input_2_shape):
+    def test__nsys_inference(self, mocker, request, input_1_shape, input_2_shape):
         model = BatchMatMulModel()
-        self.assert_delegated(model, input_1_shape, input_2_shape, mocker)
+        self.assert_delegated(model, input_1_shape, input_2_shape, mocker, request)
 
     @pytest.mark.parametrize(
         "input_1_shape, input_2_shape",
         BMM_SHAPES,
     )
-    def test__channels_first(self, mocker, input_1_shape, input_2_shape):
+    def test__channels_first(self, mocker, request, input_1_shape, input_2_shape):
         model = BatchMatMulMaxPoolModel()
         self.assert_delegated(
             model,
             input_1_shape,
             input_2_shape,
             mocker,
+            request,
             expected_delegated_ops={
                 BMM: 1,
                 MaxPool2DWithIndices: 1,