NXP backend: Add support for softmax with the new Neutron flow.

irtrukhina · irtrukhina · commit 154ffff0e061 · 2026-05-21T11:51:35.000+02:00
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import numpy as np
+import torch
 
 from executorch.backends.nxp.backend.custom_delegation_options import (
     CustomDelegationOptions,
@@ -58,14 +59,43 @@ def _is_supported_on_target(
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        """Check if the softmax operation can be executed on Neutron hardware.
-
-        Hardware constraints:
+        if custom_delegation_options.use_new_flow_neutron_c:
+            """Hardware constraints for the new Neutron flow:
+            1. Input and Output must be INT8/UINT8
+            2. Channels < 4096 / num_pipes * 4
+            3. Total spatial size (N*H*W) <= 4096
+            4. (channels * spatial_size) / num_macs <= 65536
+            """
+            # Constraint 1: Input and Output must be INT8/UINT8.
+            supported_types = [torch.int8, torch.uint8]
+            if not NodeConverter.uses_quantization_type_for_io(
+                node, supported_types, [0], [0]
+            ):
+                return False
+
+            # Constraint 2: Channel size limit
+            num_pipes = neutron_target_spec.get_num_pipes()
+            channels = SoftmaxConverter._get_channels(node)
+            if channels >= 4096 / num_pipes * 4:
+                return False
+
+            # Constraint 3: Spatial size limit
+            total_spatial_size = SoftmaxConverter._get_total_spatial_size(node)
+            if total_spatial_size > 4096:
+                return False
+
+            # Constraint 4: Total processing size limit
+            num_macs = neutron_target_spec.get_num_macs()
+            if channels * total_spatial_size / num_macs > 65536:
+                return False
+
+            return True
+
+        """Hardware constraints for the old Neutron flow:
         1. Input rank must be >= 2 (Neutron does not support 1D)
-        2. Channels must be a multiple of num_macs
-        3. Channels < 4096 / num_pipes * 4
-        4. Total spatial size (N*H*W) <= 4096
-        5. (channels * spatial_size) / num_macs <= 65536
+        2. Channels < 4096 / num_pipes * 4
+        3. Total spatial size (N*H*W) <= 4096
+        4. (channels * spatial_size) / num_macs <= 65536
         """
         input_shape = node.meta["val"].shape
 
@@ -78,19 +108,15 @@ def _is_supported_on_target(
         channels = SoftmaxConverter._get_channels(node)
         total_spatial_size = SoftmaxConverter._get_total_spatial_size(node)
 
-        # Constraint 2: Channels must be a multiple of num_macs
-        if channels % num_macs != 0:
-            return False
-
-        # Constraint 3: Channel size limit
+        # Constraint 2: Channel size limit
         if channels >= 4096 / num_pipes * 4:
             return False
 
-        # Constraint 4: Spatial size limit
+        # Constraint 3: Spatial size limit
         if total_spatial_size > 4096:
             return False
 
-        # Constraint 5: Total processing size limit
+        # Constraint 4: Total processing size limit
         if channels * total_spatial_size / num_macs > 65536:
             return False
 
diff --git a/backends/nxp/tests/generic_tests/test_cifarnet.py b/backends/nxp/tests/generic_tests/test_cifarnet.py
@@ -11,10 +11,7 @@
 from executorch.backends.nxp.tests.config_importer import test_config
 from executorch.backends.nxp.tests.dataset_creator import CopyDatasetCreator
 from executorch.backends.nxp.tests.executorch_pipeline import ModelInputSpec
-from executorch.backends.nxp.tests.graph_verifier import (
-    BaseGraphVerifier,
-    NonDelegatedNode,
-)
+from executorch.backends.nxp.tests.graph_verifier import BaseGraphVerifier
 from executorch.backends.nxp.tests.model_output_comparator import (
     NumericalStatsOutputComparator,
 )
@@ -56,17 +53,15 @@ def test_cifarnet(mocker, cifar_test_files, channels_last):
         model.to(memory_format=torch.channels_last)
         input_spec.dim_order = torch.channels_last
 
-    non_dlg_nodes = [NonDelegatedNode("aten__softmax_default", 1)]
-
     comparator = NumericalStatsOutputComparator(
-        max_mse_error=1.0e-3, is_classification_task=True
+        max_mse_error=2.0e-2, is_classification_task=True
     )
     lower_run_compare(
         model,
         [input_spec],
         dataset_creator=CopyDatasetCreator(cifar_test_files),
         output_comparator=comparator,
-        dlg_model_verifier=BaseGraphVerifier(1, non_dlg_nodes),
+        dlg_model_verifier=BaseGraphVerifier(1, []),
         mocker=mocker,
         # Run the channels last reference in PyTorch as the ExecuTorch CPU model contains incorrectly
         #  lowered channels last convolution weights, which cause incorrect inference results. The issue
@@ -83,7 +78,6 @@ def test_cifarnet_qat(mocker, cifar_test_files):
     model = CifarNet().get_eager_model().eval()
 
     input_shape = (1, 3, 32, 32)
-    non_dlg_nodes = [NonDelegatedNode("aten__softmax_default", 1)]
 
     # The higher MSE threshold is due to using weaker "MovingAbs" observers instead of "MinMax" observers.
     # The "MovingAbs" observers capture only limited number of past calibration samples compared to "MinMax",
@@ -96,7 +90,7 @@ def test_cifarnet_qat(mocker, cifar_test_files):
         input_shape,
         dataset_creator=CopyDatasetCreator(cifar_test_files),
         output_comparator=comparator,
-        dlg_model_verifier=BaseGraphVerifier(1, non_dlg_nodes),
+        dlg_model_verifier=BaseGraphVerifier(1, []),
         mocker=mocker,
         use_qat=True,
     )
diff --git a/backends/nxp/tests/generic_tests/test_integration.py b/backends/nxp/tests/generic_tests/test_integration.py
@@ -28,8 +28,8 @@ def test_conv_fc_softmax__to_executorch_program(use_qat):
 
     delegation_info = get_delegation_info(program.graph_module)
     assert delegation_info.num_delegated_subgraphs == 1
-    assert delegation_info.num_non_delegated_nodes == 11
-    assert delegation_info.num_delegated_nodes == 13
+    assert delegation_info.num_non_delegated_nodes == 5
+    assert delegation_info.num_delegated_nodes == 16
 
     for node in program.graph.nodes:
         # Make sure Convolution and AddMM are delegated
@@ -46,8 +46,8 @@ def test_cifarnet(use_qat):
 
     delegation_info = get_delegation_info(exec_prog.exported_program().graph_module)
     assert delegation_info.num_delegated_subgraphs == 1
-    assert delegation_info.num_non_delegated_nodes == 11
-    assert delegation_info.num_delegated_nodes == 45
+    assert delegation_info.num_non_delegated_nodes == 5
+    assert delegation_info.num_delegated_nodes == 48
 
     nodes = list(exec_prog.exported_program().graph.nodes)
     assert nodes[2].name == "quantized_decomposed_quantize_per_tensor_default"
diff --git a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py
@@ -85,7 +85,7 @@ def test_conv_fc__lowered_program_and_tflite_output_match(mocker):
     # No Transpose ops in produced TFLite model
     tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0)
 
-    assert tflite_subgraph.OperatorsLength() == 3
+    assert tflite_subgraph.OperatorsLength() == 4
     assert (
         tflite_subgraph.Operators(0).BuiltinOptionsType()
         == BuiltinOptions.Conv2DOptions
@@ -98,6 +98,10 @@ def test_conv_fc__lowered_program_and_tflite_output_match(mocker):
         tflite_subgraph.Operators(2).BuiltinOptionsType()
         == BuiltinOptions.FullyConnectedOptions
     )
+    assert (
+        tflite_subgraph.Operators(3).BuiltinOptionsType()
+        == BuiltinOptions.SoftmaxOptions
+    )
 
     # Verify outputs of program and TFLite model
     input_data = (
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py
@@ -17,12 +17,19 @@
     ToChannelFirstPreprocess,
     ToChannelLastPreprocess,
 )
+
+from executorch.backends.nxp.tests.graph_verifier import BaseGraphVerifier
+
+from executorch.backends.nxp.tests.model_output_comparator import (
+    NumericalStatsOutputComparator,
+)
 from executorch.backends.nxp.tests.models import SoftmaxModule
-from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
+from executorch.backends.nxp.tests.ops_aliases import Softmax
 
 # noinspection PyProtectedMember
+
 ExecutorchDelegateCall = torch._higher_order_ops.executorch_call_delegate
-Softmax = exir_ops.edge.aten._softmax.default
 
 
 @pytest.fixture(autouse=True)
@@ -207,3 +214,70 @@ def test_softmax_delegation__1d():
     model = SoftmaxModule(dim)
     delegated_ep = to_quantized_edge_program(model, input_shape).exported_program()
     assert_softmax_not_delegated(delegated_ep.graph)
+
+
+class TestSoftmaxNewNeutronFlow:
+    @pytest.mark.parametrize(
+        "input_shape, dim",
+        [
+            # Dim must always be the last dimension.
+            pytest.param((10,), -1, id="1D_dim_-1"),
+            pytest.param((5, 21), -1, id="2D_dim_-1"),
+            pytest.param((2, 3, 13), -1, id="3D_dim_-1"),
+            pytest.param((1, 3, 3, 200), -1, id="4D_dim_-1"),
+            pytest.param((5, 4, 3, 2, 180), -1, id="5D_dim_-1"),
+        ],
+    )
+    def test__basic_nsys_inference(self, input_shape, dim):
+        model = SoftmaxModule(dim)
+        graph_verifier = BaseGraphVerifier(
+            exp_num_delegate_call_nodes=1,  # Delegated Softmax.
+            exp_non_delegated_nodes=[],
+        )
+        output_comparator = NumericalStatsOutputComparator(
+            max_mse_error=0.001, is_classification_task=True
+        )
+        lower_run_compare(
+            model,
+            input_shape,
+            graph_verifier,
+            use_new_flow_neutron_c=True,
+            output_comparator=output_comparator,
+        )
+
+    @pytest.mark.parametrize(
+        "input_shape, dim",
+        [
+            pytest.param((4096, 8), -1, id="2D_spatial_size_limit"),
+            pytest.param((2040,), -1, id="1D_channels_limit"),
+            pytest.param((4096, 128), -1, id="2D_total_size_limit"),
+            pytest.param((1, 64, 64, 8), -1, id="4D_spatial_size_limit"),
+        ],
+    )
+    def test__limits(self, input_shape, dim, mocker):
+        model = SoftmaxModule(dim)
+        delegated_ep = to_quantized_edge_program(
+            model, input_shape, use_new_flow_neutron_c=True
+        ).exported_program()
+
+        # Make sure the `softmax` was delegated.
+        assert_softmax_delegated(delegated_ep.graph)
+
+    @pytest.mark.parametrize(
+        "input_shape, dim",
+        [
+            pytest.param((4097, 8), -1, id="2D_spatial_size_exceeded"),
+            pytest.param((2048,), -1, id="1D_channels_exceeded"),
+            pytest.param((4096, 129), -1, id="2D_total_size_exceeded"),
+            pytest.param((1, 64, 65, 8), -1, id="4D_spatial_size_exceeded"),
+        ],
+    )
+    def test__limits_exceeded(self, input_shape, dim):
+        model = SoftmaxModule(dim)
+        delegated_ep = to_quantized_edge_program(
+            model, input_shape, use_new_flow_neutron_c=True
+        ).exported_program()
+
+        # Make sure the `softmax` was NOT delegated.
+
+        assert_softmax_not_delegated(delegated_ep.graph)
diff --git a/backends/nxp/tests/ir/edge_passes/test_remove_io_quant_ops_pass.py b/backends/nxp/tests/ir/edge_passes/test_remove_io_quant_ops_pass.py
@@ -62,12 +62,12 @@ def test_remove_io_quant_ops_pass__cifarnet():
     )
 
     nodes = list(exec_prog.exported_program().graph.nodes)
-    assert len(nodes) == 11
+    assert len(nodes) == 5
     assert (
         nodes[0].meta["val"].dtype == torch.int8
     ), "Input tensor doesn't have type INT8."
     assert (
-        nodes[10].meta["val"][0].dtype == torch.int8
+        nodes[4].meta["val"][0].dtype == torch.int8
     ), "Output tensor doesn't have type INT8."
 
     assert (