Qualcomm AI Engine Direct - Adding QNN backend support for div.Tensor_mode core ATen op (#19785)

qti-horodnic · web-flow · commit 9c4471c620a7 · 2026-06-15T19:49:42.000-04:00
### Summary

Added support for the core ATen op `div.Tensor_mode` using a decomposition pass built on the `div`, `trunc`, and `floor` ops, chosen according to the selected rounding mode:

```
div(x, y, rounding_mode=None)    -> div(x, y)
div(x, y, rounding_mode="trunc") -> trunc(div(x, y))
div(x, y, rounding_mode="floor") -> floor(div(x, y))
```

### Test plan

```
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_div_mode --model SM8750 --host aisw-vm15-labsd --device 545ee4aa --build_folder build-android
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNFloatingPointOperator.test_qnn_backend_div_mode --model SM8750 --host aisw-vm15-labsd --device 545ee4aa --build_folder build-android
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_div_scalar_mode --model SM8750 --host aisw-vm15-labsd --device 545ee4aa --build_folder build-android
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNFloatingPointOperator.test_qnn_backend_div_scalar_mode --model SM8750 --host aisw-vm15-labsd --device 545ee4aa --build_folder build-android
```

cc @cccclai @cbilgin @abhinaykukkadapu
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
@@ -19,6 +19,7 @@
 from .decompose_binary_alpha import DecomposeBinaryAlpha
 from .decompose_cdist import DecomposeCDist
 from .decompose_col_im import DecomposeColIm
+from .decompose_div_mode import DecomposeDivMode
 from .decompose_einsum import DecomposeEinsum
 from .decompose_expm1 import DecomposeExpM1
 from .decompose_fill import DecomposeFill
@@ -82,6 +83,7 @@
     DecomposeBinaryAlpha,
     DecomposeCDist,
     DecomposeColIm,
+    DecomposeDivMode,
     DecomposeEinsum,
     DecomposeExpM1,
     DecomposeFill,
diff --git a/backends/qualcomm/_passes/decompose_div_mode.py b/backends/qualcomm/_passes/decompose_div_mode.py
@@ -0,0 +1,91 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_meta
+
+
+class DecomposeDivMode(ExportPass):
+    """
+    Decompose aten.div.Tensor_mode into supported primitives.
+
+    div(x, y, rounding_mode=None)    -> div(x, y)
+    div(x, y, rounding_mode="trunc") -> trunc(div(x, y))
+    div(x, y, rounding_mode="floor") -> floor(div(x, y))
+
+    Note: div.Scalar_mode is handled by LiftConstantScalarOperands which converts it to div.Tensor_mode before this pass runs.
+    """
+
+    def __init__(self):
+        super(DecomposeDivMode, self).__init__()
+        self.targets = {
+            torch.ops.aten.div.Tensor_mode,
+            exir_ops.edge.aten.div.Tensor_mode,
+        }
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+
+        for node in list(graph.nodes):
+            if node.op == "call_function" and node.target in self.targets:
+                is_edge = isinstance(node.target, EdgeOpOverload)
+                meta = node.meta
+
+                x_node = node.args[0]
+                y_node = node.args[1]
+
+                rounding_mode = node.kwargs.get("rounding_mode", None)
+                if rounding_mode is None and len(node.args) > 2:
+                    rounding_mode = node.args[2]
+
+                div_op = (
+                    exir_ops.edge.aten.div.Tensor
+                    if is_edge
+                    else torch.ops.aten.div.Tensor
+                )
+
+                with graph.inserting_before(node):
+                    # Step 1: div_result = div(x, y)
+                    div_node = graph.create_node(
+                        "call_function", div_op, (x_node, y_node)
+                    )
+                    div_node.meta = copy_meta(meta)
+
+                    # Step 2: Apply rounding mode if needed
+                    if rounding_mode == "trunc":
+                        trunc_op = (
+                            exir_ops.edge.aten.trunc.default
+                            if is_edge
+                            else torch.ops.aten.trunc.default
+                        )
+                        result_node = graph.create_node(
+                            "call_function", trunc_op, (div_node,)
+                        )
+                        result_node.meta = copy_meta(meta)
+                    elif rounding_mode == "floor":
+                        floor_op = (
+                            exir_ops.edge.aten.floor.default
+                            if is_edge
+                            else torch.ops.aten.floor.default
+                        )
+                        result_node = graph.create_node(
+                            "call_function", floor_op, (div_node,)
+                        )
+                        result_node.meta = copy_meta(meta)
+                    else:
+                        # rounding_mode=None: plain division
+                        result_node = div_node
+
+                for user in node.users.copy():
+                    user.replace_input_with(node, result_node)
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/_passes/lift_constant_scalar_operands.py b/backends/qualcomm/_passes/lift_constant_scalar_operands.py
@@ -43,6 +43,7 @@ class TensorOpInfo:
     # For below cases, refer to LiftAddTensor Model in UT for sample
     aten.add.Tensor: TensorOpInfo(aten.add.Tensor, False, False),
     aten.div.Scalar: TensorOpInfo(aten.div.Tensor, False, False),
+    aten.div.Scalar_mode: TensorOpInfo(aten.div.Tensor_mode, False, False),
     aten.mul.Scalar: TensorOpInfo(aten.mul.Tensor, False, False),
     aten.rsub.Scalar: TensorOpInfo(aten.rsub.Tensor, False, False),
     aten.sub.Scalar: TensorOpInfo(aten.sub.Tensor, False, False),
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -25,6 +25,7 @@
     DecomposeBinaryAlpha,
     DecomposeCDist,
     DecomposeColIm,
+    DecomposeDivMode,
     DecomposeEinsum,
     DecomposeExpM1,
     DecomposeFill,
@@ -127,6 +128,7 @@ def get_default_pass_activations(cls):
             (DecomposeAtan2, True),
             (DecomposeColIm, True),
             (DecomposeCDist, True),
+            (DecomposeDivMode, True),
             (DecomposeFill, True),
             (DecomposeLogVariants, True),
             (DecomposeMaxPool3d, True),
@@ -164,6 +166,7 @@ def get_annotation_passes(cls):
             DecomposeAtan2,
             DecomposeBinaryAlpha,
             DecomposeCDist,
+            DecomposeDivMode,
             DecomposeMaxPool3d,
             DecomposePad,
             DecomposeScaledDotProductAttention,
@@ -280,6 +283,7 @@ def get_passes_dependency_for_capture_program(cls):
             DecomposeAtan2: [RemoveRedundancy],
             DecomposeColIm: [FoldQDQ],
             DecomposeCDist: [RemoveRedundancy],
+            DecomposeDivMode: [RemoveRedundancy],
             DecomposeFill: [RemoveRedundancy],
             DecomposeLinalgVectorNorm: [RemoveRedundancy],
             DecomposeLogVariants: [RemoveRedundancy],
diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md
@@ -503,6 +503,8 @@ The following PyTorch operators are supported through decomposition or annotatio
 | `aten.atan2.default`, `aten.atan2.out` | `DecomposeAtan2` |
 | `aten.add` (with alpha), `aten.sub` (with alpha) | `DecomposeBinaryAlpha` |
 | `aten.cdist`, `aten._cdist_forward` | `DecomposeCDist` |
+| `aten.div.Tensor_mode` | `DecomposeDivMode` |
+| `aten.div.Scalar_mode` | `LiftConstantScalarOperands` → `DecomposeDivMode` |
 | `aten.im2col`, `aten.col2im` | `DecomposeColIm` |
 | `aten.einsum` | `DecomposeEinsum` |
 | `aten.special_expm1` | `DecomposeExpM1` |
diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py
@@ -20,7 +20,6 @@
 
 to_be_implemented_operator = [
     exir_ops.edge.aten.adaptive_max_pool3d.default,
-    exir_ops.edge.aten.div.Tensor_mode,
     exir_ops.edge.aten.max_pool3d_with_indices.default,
     exir_ops.edge.aten.median.default,
     exir_ops.edge.aten.median.dim,
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
@@ -1021,6 +1021,25 @@ def forward(self, x):
         return x / 10
 
 
+class DivMode(torch.nn.Module):
+    def __init__(self, rounding_mode=None):
+        super().__init__()
+        self.rounding_mode = rounding_mode
+
+    def forward(self, x, y):
+        return torch.div(x, y, rounding_mode=self.rounding_mode)
+
+
+class DivScalarMode(torch.nn.Module):
+    def __init__(self, scalar=2.0, rounding_mode=None):
+        super().__init__()
+        self.scalar = scalar
+        self.rounding_mode = rounding_mode
+
+    def forward(self, x):
+        return torch.div(x, self.scalar, rounding_mode=self.rounding_mode)
+
+
 class DrawGraphModel(torch.nn.Module):
     def __init__(self):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -633,6 +633,52 @@ def test_qnn_backend_cumsum(self):
                         index += 1
                         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_div_mode(self):
+        test_comb = [
+            {
+                QCOM_MODULE: [
+                    DivMode(rounding_mode=None),  # noqa: F405
+                    DivMode(rounding_mode="trunc"),  # noqa: F405
+                    DivMode(rounding_mode="floor"),  # noqa: F405
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (
+                        torch.tensor([7.0, 5.0, -3.0, 8.0, 1.0, 9.0]).reshape(2, 3),
+                        torch.tensor([2.0, 3.0, 2.0, 5.0, 4.0, 2.0]).reshape(2, 3),
+                    ),
+                ],
+            },
+        ]
+
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        index += 1
+                        self.lower_module_and_test_output(module, sample_input)
+
+    def test_qnn_backend_div_scalar_mode(self):
+        test_comb = [
+            {
+                QCOM_MODULE: [
+                    DivScalarMode(scalar=2.0, rounding_mode="trunc"),  # noqa: F405
+                    DivScalarMode(scalar=3.0, rounding_mode="floor"),  # noqa: F405
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.tensor([7.0, 5.0, -3.0, 8.0, 1.0, 9.0]).reshape(2, 3),),
+                ],
+            },
+        ]
+
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        index += 1
+                        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_einsum_outer_product(self):
         module = EinsumOuterProduct()  # noqa: F405
         x = torch.randn(5)
@@ -3434,6 +3480,54 @@ def test_qnn_backend_cumsum(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_div_mode(self):
+        test_comb = [
+            {
+                QCOM_MODULE: [
+                    DivMode(rounding_mode=None),  # noqa: F405
+                    DivMode(rounding_mode="trunc"),  # noqa: F405
+                    DivMode(rounding_mode="floor"),  # noqa: F405
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (
+                        torch.tensor([7.0, 5.0, -3.0, 8.0, 1.0, 9.0]).reshape(2, 3),
+                        torch.tensor([2.0, 3.0, 2.0, 5.0, 4.0, 2.0]).reshape(2, 3),
+                    ),
+                ],
+            },
+        ]
+
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        index += 1
+                        qdq_module = self.get_qdq_module(module, sample_input)
+                        self.lower_module_and_test_output(qdq_module, sample_input)
+
+    def test_qnn_backend_div_scalar_mode(self):
+        test_comb = [
+            {
+                QCOM_MODULE: [
+                    DivScalarMode(scalar=2.0, rounding_mode="trunc"),  # noqa: F405
+                    DivScalarMode(scalar=3.0, rounding_mode="floor"),  # noqa: F405
+                ],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.tensor([7.0, 5.0, -3.0, 8.0, 1.0, 9.0]).reshape(2, 3),),
+                ],
+            },
+        ]
+
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        index += 1
+                        qdq_module = self.get_qdq_module(module, sample_input)
+                        self.lower_module_and_test_output(qdq_module, sample_input)
+
     def test_qnn_backend_einsum_outer_product(self):
         module = EinsumOuterProduct()  # noqa: F405
         x = torch.randn(5)