Use CapabilityBasedPartitioner in AotiPartitioner (#20384)

shoumikhin · facebook-github-bot · commit a67fd358796c · 2026-06-18T13:29:37.000-07:00
Summary:

AotiPartitioner (the base for the CUDA and Metal backends) groups the ops it
delegates into one partition, by hand. Every other ExecuTorch backend (XNNPACK,
Vulkan, CoreML) uses the shared CapabilityBasedPartitioner helper instead. This
switches AotiPartitioner to that helper too.

Why:
1. Consistency -- same partitioning path as the other backends, and a real
   OperatorSupport hook instead of a hand-rolled tagging loop.
2. Correctness -- the hand-rolled single partition can be invalid. A delegate
   has to be a convex region of the graph. If the ops being delegated aren't all
   next to each other (some non-delegated node sits in between), putting them
   all in one partition is non-convex and lowering crashes with
   "AssertionError: Invalid partition, found dependency cycles".
   CapabilityBasedPartitioner returns several maximal convex partitions instead,
   each of which fuses cleanly.

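No structural change for the common case: if every op can be delegated, you
still get exactly one partition (no extra delegate boundaries), although its tag
is now "aoti_<partition id>" instead of "tag0". When a non-delegated node
splits the delegated ops, this emits one partition (and one delegate boundary)
per island, which is the cost of producing a valid program. Control-flow ops
(cond/map/while_loop, and now also scan) keep their branch get_attr operands in
the same partition as the op itself. Constant/buffer tagging follows the same
rules as before, with two adjustments: tag_constant_data and tag_mutated_buffer
now pick min(user_tags) instead of user_tags.pop(), so a constant shared across
partitions is assigned reproducibly, and unused constants are tagged with the
first partition's tag (or left untagged when there is no partition).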

Reviewed By: Gasoonjia

Differential Revision: D109040727
diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Mapping, Optional, Tuple
 
 import torch
 from executorch.exir._warnings import experimental
@@ -21,6 +21,8 @@
 )
 from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
 from torch.export.exported_program import ExportedProgram
+from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
+from torch.fx.passes.operator_support import OperatorSupportBase
 
 
 @experimental(
@@ -30,12 +32,10 @@ class AotiPartitioner(Partitioner):
     """
     Base partitioner for AOTInductor-driven backend integration.
 
-    This partitioner creates a single partition containing all operators from the input graph.
-    It skips core ATen decomposition, allowing the backend to handle decomposition using
+    Delegates the non-lowered operators to AOTInductor as one or more convex
+    partitions (a single partition when nothing else has claimed part of the
+    graph). It skips core ATen decomposition, letting the backend decompose via
     AOTInductor's backend-specific decomposition table.
-
-    Only operators that cannot be handled by the aoti library will be excluded from
-    the partition and fall back to ExecuTorch's default or custom handling.
     """
 
     def __init__(self, backend_name: str, compile_spec: List[CompileSpec]) -> None:
@@ -49,62 +49,76 @@ def __init__(self, backend_name: str, compile_spec: List[CompileSpec]) -> None:
         self.delegation_spec = DelegationSpec(backend_name, compile_spec)
 
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
-        """
-        Fully delegate the graph to AOTInductor by tagging all nodes as a single partition.
-        """
+        """Delegate the non-lowered ops to AOTInductor.
 
-        partition_tags: Dict[str, DelegationSpec] = {}
-        tag = "tag0"
-
-        # Tag torch.cond and other control flow operations
-        def is_control_flow(node: torch.fx.Node) -> bool:
-            return node.op == "call_function" and node.target in [
-                torch.ops.higher_order.cond,
-                torch.ops.higher_order.map_impl,
-                torch.ops.higher_order.while_loop,
-            ]
-
-        # Nodes already lowered by an earlier partitioner (e.g. a preceding
-        # TensorRT partition) appear as executorch_call_delegate calls and their
-        # output getitems; re-delegating them would nest a foreign delegate. Tag
-        # only the remaining non-lowered ops so this partitioner composes after
-        # others.
+        Uses CapabilityBasedPartitioner rather than a single tag because a
+        delegated submodule must be convex: if a node that is not delegated sits
+        between the delegated ops, one tag would span a non-convex set and fusion
+        would fail with a dependency cycle.
+        """
+        # Only nodes not already lowered are candidates for this backend.
         non_lowered_nodes = set(get_non_lowered_nodes(exported_program.graph))
 
-        for node in exported_program.graph.nodes:
-            if node.op == "call_function":
-                if node not in non_lowered_nodes:
-                    continue
+        control_flow_targets = [
+            torch.ops.higher_order.cond,
+            torch.ops.higher_order.map_impl,
+            torch.ops.higher_order.while_loop,
+            torch.ops.higher_order.scan,
+        ]
+
+        class AotiOperatorSupport(OperatorSupportBase):
+            def is_node_supported(
+                self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node
+            ) -> bool:
+                return node.op == "call_function" and node in non_lowered_nodes
+
+        partitioner = CapabilityBasedPartitioner(
+            exported_program.graph_module,
+            AotiOperatorSupport(),
+            allows_single_node_partition=True,
+        )
+
+        partition_tags: Dict[str, DelegationSpec] = {}
+        for partition in partitioner.propose_partitions():
+            tag = f"aoti_{partition.id}"
+            partition_tags[tag] = self.delegation_spec
+            for node in partition.nodes:
                 node.meta["delegation_tag"] = tag
-            # Tag get_attr nodes that are used by control flow operations
-            elif node.op == "get_attr":
-                # Check if any user is a control flow operation
-                for user in node.users:
-                    if is_control_flow(user):
-                        node.meta["delegation_tag"] = tag
-                        break
 
-        partition_tags[tag] = self.delegation_spec
+        # A control-flow op carries its branch GraphModules as get_attr operands;
+        # they must share the op's tag so they land inside the same submodule. A
+        # branch module feeds a single control-flow op, so first match wins.
+        for node in exported_program.graph.nodes:
+            if node.op != "get_attr":
+                continue
+            for user in node.users:
+                if (
+                    user.op == "call_function"
+                    and user.target in control_flow_targets
+                    and "delegation_tag" in user.meta
+                ):
+                    node.meta["delegation_tag"] = user.meta["delegation_tag"]
+                    break
 
         tag_constant_data(exported_program)
         tag_mutated_buffer(exported_program)
 
-        # A constant that still has users feeds only a prior delegate; tagging it
-        # would fail backend lowering's same-tag check (its user keeps the prior
-        # tag). tag_constant_data already claimed the ones this partition uses, so
-        # tag only the genuinely unused constants here.
-        for node in exported_program.graph.nodes:
-            if (
-                node.op == "placeholder"
-                and not node.users
-                and "delegation_tag" not in node.meta
-                and (
-                    is_param(exported_program, node)
-                    or is_buffer(exported_program, node)
-                    or is_lifted_tensor_constant(exported_program, node)
-                )
-            ):
-                node.meta["delegation_tag"] = tag
+        # tag_constant_data only tags constants that have users; tag the
+        # genuinely unused ones too so none are left dangling.
+        if partition_tags:
+            fallback_tag = next(iter(partition_tags))
+            for node in exported_program.graph.nodes:
+                if (
+                    node.op == "placeholder"
+                    and not node.users
+                    and "delegation_tag" not in node.meta
+                    and (
+                        is_param(exported_program, node)
+                        or is_buffer(exported_program, node)
+                        or is_lifted_tensor_constant(exported_program, node)
+                    )
+                ):
+                    node.meta["delegation_tag"] = fallback_tag
 
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
diff --git a/backends/cuda/tests/test_cuda_partitioner.py b/backends/cuda/tests/test_cuda_partitioner.py
@@ -12,17 +12,18 @@
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 from executorch.exir.backend.partitioner import PartitionResult
 from executorch.exir.delegate import executorch_call_delegate
-from torch._export.utils import is_buffer
+from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
 from torch.export import export
+from torch.fx.passes.utils.fuser_utils import validate_partition
 
 
 class TestCudaPartitioner(unittest.TestCase):
     """
     Test CUDA partitioner functionality.
 
-    After CUDA partitioning, there should be exactly one partitioned graph that contains
-    all operators from the input graph. This means all operators should be tagged with
-    the same delegation tag, indicating they will all be executed by the CUDA backend.
+    A fully delegatable graph collapses to a single partition. When a
+    non-delegated node splits the delegatable ops, the partitioner emits one
+    convex partition per island.
     """
 
     def _get_partition_result(
@@ -178,12 +179,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         for node in partition_result.tagged_exported_program.graph.nodes:
             if node.op == "placeholder":
                 # Check if this is a constant (param, buffer, or lifted tensor constant)
-                from torch._export.utils import (
-                    is_buffer,
-                    is_lifted_tensor_constant,
-                    is_param,
-                )
-
                 is_constant = (
                     is_param(partition_result.tagged_exported_program, node)
                     or is_buffer(partition_result.tagged_exported_program, node)
@@ -216,8 +211,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             f"All constant placeholders should be tagged. Found untagged constants: {untagged_constants}",
         )
 
-        # Verify all tagged constants have the expected tag
-        expected_tag = "tag0"
+        # Verify all tagged constants share the (single) partition's tag.
+        self.assertEqual(len(partition_result.partition_tags), 1)
+        expected_tag = next(iter(partition_result.partition_tags))
         for node in constant_placeholders:
             actual_tag = node.meta.get("delegation_tag")
             self.assertEqual(
@@ -320,3 +316,143 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         self.assertNotIn("delegation_tag", buffer_placeholder.meta)
         self.assertNotIn("delegation_tag", delegate.meta)
         self.assertIn("delegation_tag", aten_node.meta)
+
+    def test_multiple_partitions_for_split_graph(self) -> None:
+        """Ops split by a non-delegated node must land in separate partitions.
+
+        One tag over the disconnected islands would be non-convex and fail fusion.
+        """
+
+        class TwoAddModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                a = x + 1.0
+                return a + 2.0
+
+        exported_program = export(TwoAddModule(), (torch.randn(3, 4),), strict=True)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+
+        add_nodes = [
+            n
+            for n in graph.nodes
+            if n.op == "call_function" and n.target != operator.getitem
+        ]
+        first_add, second_add = add_nodes[0], add_nodes[1]
+
+        # Splice an already-lowered region between the two adds so the second add
+        # depends on the first only through that non-delegated node.
+        graph_module.lowered_module_0 = torch.nn.Module()
+        with graph.inserting_before(second_add):
+            lowered = graph.get_attr("lowered_module_0")
+            delegate = graph.call_function(
+                executorch_call_delegate, (lowered, first_add)
+            )
+            delegate_output = graph.call_function(operator.getitem, (delegate, 0))
+        second_add.replace_input_with(first_add, delegate_output)
+        graph.lint()
+
+        result = CudaPartitioner([]).partition(exported_program)
+
+        # Separated by the delegate, the adds must land in different partitions.
+        self.assertEqual(len(result.partition_tags), 2)
+        self.assertIn("delegation_tag", first_add.meta)
+        self.assertIn("delegation_tag", second_add.meta)
+        self.assertNotEqual(
+            first_add.meta["delegation_tag"], second_add.meta["delegation_tag"]
+        )
+        self.assertNotIn("delegation_tag", delegate.meta)
+        self.assertNotIn("delegation_tag", delegate_output.meta)
+
+        # Each partition must be convex on its own so fusion does not cycle.
+        for tag in result.partition_tags:
+            tagged = [
+                n
+                for n in exported_program.graph.nodes
+                if n.meta.get("delegation_tag") == tag
+            ]
+            self.assertTrue(validate_partition(tagged))
+
+    def test_control_flow_get_attr_shares_op_tag(self) -> None:
+        """A control-flow op's branch get_attrs must share the op's partition tag.
+
+        They are not call_function nodes, so the capability partitioner does not
+        claim them; they must be lowered into the same submodule as the op.
+        """
+
+        class CondModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.cond(x.sum() > 0, torch.sin, torch.cos, (x,))
+
+        exported_program = export(CondModule(), (torch.randn(3, 4),), strict=True)
+        result = CudaPartitioner([]).partition(exported_program)
+
+        cond_node = next(
+            n
+            for n in exported_program.graph.nodes
+            if n.op == "call_function" and n.target is torch.ops.higher_order.cond
+        )
+        branch_get_attrs = [
+            arg
+            for arg in cond_node.args
+            if isinstance(arg, torch.fx.Node) and arg.op == "get_attr"
+        ]
+
+        self.assertEqual(len(branch_get_attrs), 2)
+        self.assertIn(cond_node.meta["delegation_tag"], result.partition_tags)
+        for get_attr in branch_get_attrs:
+            self.assertEqual(
+                get_attr.meta.get("delegation_tag"),
+                cond_node.meta["delegation_tag"],
+            )
+
+    def test_shared_constant_across_partitions(self) -> None:
+        """A constant read by two partitions is claimed, not dropped.
+
+        tag_constant_data assigns it one partition's tag; backend lowering later
+        duplicates it per consumer, so partitioning must not crash or drop it.
+        """
+
+        class SharedWeightModule(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.register_buffer("w", torch.randn(3, 4))
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return (x + self.w) + self.w
+
+        exported_program = export(
+            SharedWeightModule(), (torch.randn(3, 4),), strict=True
+        )
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+
+        add_nodes = [
+            n
+            for n in graph.nodes
+            if n.op == "call_function" and n.target != operator.getitem
+        ]
+        first_add, second_add = add_nodes[0], add_nodes[1]
+
+        # Split the two adds (both reading w) with an already-lowered region.
+        graph_module.lowered_module_0 = torch.nn.Module()
+        with graph.inserting_before(second_add):
+            lowered = graph.get_attr("lowered_module_0")
+            delegate = graph.call_function(
+                executorch_call_delegate, (lowered, first_add)
+            )
+            delegate_output = graph.call_function(operator.getitem, (delegate, 0))
+        second_add.replace_input_with(first_add, delegate_output)
+        graph.lint()
+
+        result = CudaPartitioner([]).partition(exported_program)
+
+        # Two islands, and the shared buffer is claimed by one of them, not dropped.
+        self.assertEqual(len(result.partition_tags), 2)
+        buffer_placeholder = next(
+            n
+            for n in graph.nodes
+            if n.op == "placeholder" and is_buffer(exported_program, n)
+        )
+        self.assertIn(
+            buffer_placeholder.meta.get("delegation_tag"), result.partition_tags
+        )
diff --git a/exir/backend/utils.py b/exir/backend/utils.py
@@ -390,9 +390,10 @@ def tag_constant_data(edge_program: ExportedProgram) -> None:
                         "If the data is too large and it's not preferred to copy, please tag the "
                         "constant node like node.['no_copy'] = True and they won't be copied."
                     )
-                # tag the data node with the same tag as the last user
+                # Pick a deterministic consumer tag so a constant shared across
+                # partitions is assigned reproducibly across runs.
                 if len(user_tags) > 0:
-                    node.meta["delegation_tag"] = user_tags.pop()
+                    node.meta["delegation_tag"] = min(user_tags)
 
 
 def tag_mutated_buffer(edge_program: ExportedProgram) -> None:
@@ -429,9 +430,10 @@ def tag_mutated_buffer(edge_program: ExportedProgram) -> None:
                     "If the data is too large and it's not preferred to copy, please tag the "
                     "constant node like node.['no_copy'] = True and they won't be copied."
                 )
-            # tag the data node with the same tag as the last user
+            # Pick a deterministic consumer tag so a buffer shared across
+            # partitions is assigned reproducibly across runs.
             if len(user_tags) > 0:
-                node.meta["delegation_tag"] = user_tags.pop()
+                node.meta["delegation_tag"] = min(user_tags)
 
 
 def is_shape_dynamic(node: torch.fx.Node) -> bool: