up

metascroy · metascroy · commit dce0bec46acf · 2026-06-17T10:49:27.000-07:00
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
@@ -79,6 +79,7 @@ jobs:
           backends/mlx/test/test_pattern_utils.py \
           backends/mlx/test/test_partitioner.py \
           backends/mlx/test/test_serialization_dedup.py \
+          backends/mlx/test/test_slot_recycling.py \
           examples/models/gemma4_31b/quant/tests/test_pack_mlx.py \
           examples/models/gemma4_31b/tests/test_mlx_pipeline.py \
           -v
diff --git a/backends/mlx/builder/program_builder.py b/backends/mlx/builder/program_builder.py
@@ -242,6 +242,13 @@ def make_tmp_value_slot(self) -> Tuple[str, Slot]:
         """Create a temporary value (SymInt) slot."""
         return self.slot_manager.make_tmp_value_slot()
 
+    def tmp_scope(self):
+        """Context manager scoping temporary slot ids for reuse.
+
+        See :meth:`SlotManager.tmp_scope`.
+        """
+        return self.slot_manager.tmp_scope()
+
     def make_or_get_constant(self, name: str, tensor: torch.Tensor) -> Slot:
         """
         Creates an extra constant outside of the ExportedProgram state_dict.
@@ -529,7 +536,8 @@ def _process_nodes(self) -> None:  # noqa C901
 
             if self.node_info[n].handler is not None:
                 handler = self.node_info[n].handler
-                handler(self, n)
+                with self.tmp_scope():
+                    handler(self, n)
                 self._mark_supported(n, handler=handler)
                 continue
 
@@ -558,7 +566,8 @@ def _process_nodes(self) -> None:  # noqa C901
                 continue
 
             try:
-                handler(self, n)
+                with self.tmp_scope():
+                    handler(self, n)
                 self._mark_supported(n, handler=handler)
             except Exception as e:
                 trace_str = traceback.format_exc()
@@ -688,14 +697,20 @@ def _collect_used_slots(
                     # Inputs, outputs, mutable buffers - always include
                     used_slots.add(s)
 
+        # Count distinct physical slots. Slots in the same pool (tensor vs. value)
+        # that share (id_space, idx) are one slot reused across disjoint lifetimes
+        # (delete-as-you-go reclaim / tmp_scope) and are coalesced to a single
+        # global id below, so count them once. SymInt/SymBool share the vid pool.
         num_tensors: Dict[IdSpace, int] = defaultdict(int)
         num_values: Dict[IdSpace, int] = defaultdict(int)
-        seen: Set[Slot] = set()
+        seen_keys: Set[Tuple[bool, IdSpace, int]] = set()
         for s in used_slots:
-            if s in seen:
+            is_tensor = s.id_type == IdType.Tensor
+            key = (is_tensor, s.id_space, s.idx)
+            if key in seen_keys:
                 continue
-            seen.add(s)
-            if s.id_type == IdType.Tensor:
+            seen_keys.add(key)
+            if is_tensor:
                 num_tensors[s.id_space] += 1
             else:
                 num_values[s.id_space] += 1
@@ -719,19 +734,28 @@ def _create_slot_mappings(
             IdSpace.Temp: 4,
         }
 
+        # Coalesce slots that share (id_space, idx) to a single global id. Such
+        # slots are the same physical slot reused across disjoint lifetimes
+        # (delete-as-you-go reclaim / tmp_scope), so they must map to the same
+        # global Tid/Vid. Sorting by (id_space, idx) keeps per-space id ranges
+        # contiguous, matching the counts from _collect_used_slots.
+        def _coalesce(slots: List[Slot]) -> Dict[Slot, int]:
+            mapping: Dict[Slot, int] = {}
+            key_to_global: Dict[Tuple[IdSpace, int], int] = {}
+            for s in sorted(slots, key=lambda s: (id_space_order[s.id_space], s.idx)):
+                key = (s.id_space, s.idx)
+                gid = key_to_global.get(key)
+                if gid is None:
+                    gid = len(key_to_global)
+                    key_to_global[key] = gid
+                mapping[s] = gid
+            return mapping
+
         # Create Tid mapping
-        slot_to_tid = sorted(
-            [s for s in used_slots if s.id_type == IdType.Tensor],
-            key=lambda s: (id_space_order[s.id_space], s.idx),
-        )
-        slot_to_tid = {s: idx for idx, s in enumerate(slot_to_tid)}
+        slot_to_tid = _coalesce([s for s in used_slots if s.id_type == IdType.Tensor])
 
         # Create Vid mapping
-        slot_to_vid = sorted(
-            [s for s in used_slots if s.id_type != IdType.Tensor],
-            key=lambda s: (id_space_order[s.id_space], s.idx),
-        )
-        slot_to_vid = {s: idx for idx, s in enumerate(slot_to_vid)}
+        slot_to_vid = _coalesce([s for s in used_slots if s.id_type != IdType.Tensor])
 
         # Remap all Tid/Vid values in instructions to use global indices
         if hasattr(self, "_tid_slot_map"):
diff --git a/backends/mlx/builder/slot_manager.py b/backends/mlx/builder/slot_manager.py
@@ -8,9 +8,10 @@
 
 import uuid
 from collections import defaultdict
+from contextlib import contextmanager
 from dataclasses import dataclass
 from enum import auto, Enum
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, Iterator, List, Optional, Tuple, Union
 
 import torch
 from torch.fx.node import Node
@@ -73,6 +74,54 @@ def __init__(self):
         self.tid_managers: Dict[IdSpace, IdManager] = defaultdict(IdManager)
         self.vid_managers: Dict[IdSpace, IdManager] = defaultdict(IdManager)
         self.name_to_slot: Dict[str, Slot] = {}
+        # Stack of active temp-slot scopes (see ``tmp_scope``). Temp tids/vids
+        # allocated via make_tmp_slot()/make_tmp_value_slot() are registered on
+        # the innermost scope and their ids returned for reuse on scope exit.
+        self._tmp_scopes: List[List[Slot]] = []
+
+    @contextmanager
+    def tmp_scope(self) -> Iterator[None]:
+        """Scope temporary slot allocations so their ids can be reused.
+
+        Temp tids/vids allocated via :meth:`make_tmp_slot` /
+        :meth:`make_tmp_value_slot` inside this context are returned to their
+        id pools when the context exits, so later allocations (temp or node)
+        can reuse them. Allocating a temp slot outside any ``tmp_scope`` raises
+        ``RuntimeError``.
+
+        Scopes may be nested; each allocation is tied to the innermost scope.
+        The Slot objects stay in ``name_to_slot`` (mirroring node-slot reclaim
+        via ``return_id``) so serialization still sees every distinct slot.
+        """
+        self._tmp_scopes.append([])
+        try:
+            yield
+        finally:
+            scope = self._tmp_scopes.pop()
+            for slot in scope:
+                if slot.id_type == IdType.Tensor:
+                    self.tid_managers[slot.id_space].return_id(slot.idx)
+                else:
+                    self.vid_managers[slot.id_space].return_id(slot.idx)
+
+    def _new_tmp_slot(self, id_type: IdType, prefix: str) -> Tuple[str, Slot]:
+        if not self._tmp_scopes:
+            raise RuntimeError(
+                f"{prefix}() must be called within a SlotManager.tmp_scope() "
+                "context so temporary ids can be reclaimed and reused."
+            )
+        name = f"{prefix}_{uuid.uuid4().hex}"
+        id_space = IdSpace.Temp
+        manager = (
+            self.tid_managers[id_space]
+            if id_type == IdType.Tensor
+            else self.vid_managers[id_space]
+        )
+        idx = manager.get_id()
+        slot = Slot(id_type=id_type, id_space=id_space, idx=idx)
+        self.name_to_slot[name] = slot
+        self._tmp_scopes[-1].append(slot)
+        return name, slot
 
     def set_slot(self, node_or_name: Union[Node, str], slot: Slot):
         if isinstance(node_or_name, Node):
@@ -129,23 +178,11 @@ def make_constant_slot(self, name: str) -> Slot:
         return slot
 
     def make_tmp_slot(self) -> Tuple[str, Slot]:
-        name = f"tmp_{uuid.uuid4().hex}"
-        id_space = IdSpace.Temp
-        manager = self.tid_managers[id_space]
-        idx = manager.get_id()
-        slot = Slot(id_type=IdType.Tensor, id_space=id_space, idx=idx)
-        self.name_to_slot[name] = slot
-        return name, slot
+        return self._new_tmp_slot(IdType.Tensor, "tmp")
 
     def make_tmp_value_slot(self) -> Tuple[str, Slot]:
         """Create a temporary SymInt slot and register it."""
-        name = f"tmp_val_{uuid.uuid4().hex}"
-        id_space = IdSpace.Temp
-        manager = self.vid_managers[id_space]
-        idx = manager.get_id()
-        slot = Slot(id_type=IdType.SymInt, id_space=id_space, idx=idx)
-        self.name_to_slot[name] = slot
-        return name, slot
+        return self._new_tmp_slot(IdType.SymInt, "tmp_val")
 
     def make_or_get_slots(
         self, node: Node, id_space: IdSpace = IdSpace.Temp
diff --git a/backends/mlx/test/test_slot_recycling.py b/backends/mlx/test/test_slot_recycling.py
@@ -0,0 +1,152 @@
+#
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+"""Regression tests for temp-slot recycling in the MLX program builder.
+
+Two invariants are guarded here:
+
+1. ``SlotManager.tmp_scope`` reclaims temp tids/vids on exit (and creating a
+   temp slot outside a scope raises), so local ids are reused.
+2. The serialized graph coalesces slots that share ``(id_space, idx)`` to a
+   single global Tid/Vid, so ``num_temp_tensors`` / ``num_values`` reflect that
+   reuse. Without this, recycled slots each get their own runtime slot (which is
+   never freed until end-of-execution ``reset()``), inflating peak memory. This
+   is easy to silently reintroduce (e.g. enumerating distinct Slot objects), so
+   it is asserted directly.
+
+Run::
+
+    python -m unittest executorch.backends.mlx.test.test_slot_recycling
+"""
+
+import unittest
+
+import torch
+import torch.nn as nn
+
+from executorch.backends.mlx.builder.program_builder import MLXProgramBuilder
+from executorch.backends.mlx.builder.slot_manager import (
+    IdSpace,
+    IdType,
+    Slot,
+    SlotManager,
+)
+from executorch.backends.mlx.serialization.mlx_graph_schema import Tid, Vid
+
+
+def _trivial_ep():
+    """Minimal ExportedProgram just to satisfy ``MLXProgramBuilder.__init__``.
+
+    The graph is never processed; the coalescing tests drive the builder's slot
+    bookkeeping directly.
+    """
+
+    class _Identity(nn.Module):
+        def forward(self, x):
+            return x + 1
+
+    return torch.export.export(_Identity(), (torch.zeros(2),))
+
+
+class TmpScopeTest(unittest.TestCase):
+    def test_make_tmp_requires_scope(self):
+        sm = SlotManager()
+        with self.assertRaises(RuntimeError):
+            sm.make_tmp_slot()
+        with self.assertRaises(RuntimeError):
+            sm.make_tmp_value_slot()
+
+    def test_tmp_ids_reclaimed_and_reused(self):
+        sm = SlotManager()
+        with sm.tmp_scope():
+            _, a = sm.make_tmp_slot()
+            _, b = sm.make_tmp_slot()
+            self.assertNotEqual(a.idx, b.idx)  # live simultaneously
+            self.assertTrue(sm.is_alive(a))
+        # Reclaimed on exit.
+        self.assertFalse(sm.is_alive(a))
+        self.assertFalse(sm.is_alive(b))
+        # Next scope reuses a freed idx.
+        with sm.tmp_scope():
+            _, c = sm.make_tmp_slot()
+            self.assertIn(c.idx, (a.idx, b.idx))
+
+    def test_value_slots_reclaimed(self):
+        sm = SlotManager()
+        with sm.tmp_scope():
+            _, v = sm.make_tmp_value_slot()
+            self.assertTrue(sm.is_alive(v))
+        self.assertFalse(sm.is_alive(v))
+
+    def test_nested_scopes(self):
+        sm = SlotManager()
+        with sm.tmp_scope():
+            _, outer = sm.make_tmp_slot()
+            with sm.tmp_scope():
+                _, inner = sm.make_tmp_slot()
+            # Inner scope reclaimed its slot; outer slot is still live.
+            self.assertFalse(sm.is_alive(inner))
+            self.assertTrue(sm.is_alive(outer))
+
+
+class SlotCoalescingTest(unittest.TestCase):
+    """Slots sharing ``(id_space, idx)`` must map to one global Tid/Vid."""
+
+    def _builder_with_slots(self, tensor_slots, value_slots):
+        P = MLXProgramBuilder(_trivial_ep())
+        # Start from a clean slot table so the trivial graph's own slots don't
+        # interfere, then register synthetic slots as if emitted by handlers.
+        P.slot_manager = SlotManager()
+        P._tid_slot_map = []
+        P._vid_slot_map = []
+        for i, s in enumerate(tensor_slots):
+            P.slot_manager.name_to_slot[f"t{i}"] = s
+            P._tid_slot_map.append((Tid(idx=None), s))
+        for i, s in enumerate(value_slots):
+            P.slot_manager.name_to_slot[f"v{i}"] = s
+            P._vid_slot_map.append((Vid(idx=None), s))
+        return P
+
+    def test_reused_tids_coalesce(self):
+        a = Slot(IdType.Tensor, IdSpace.Temp, 0)
+        b = Slot(IdType.Tensor, IdSpace.Temp, 0)  # reused idx 0 (disjoint life)
+        c = Slot(IdType.Tensor, IdSpace.Temp, 1)
+        k = Slot(IdType.Tensor, IdSpace.Constant, 0)
+        P = self._builder_with_slots([a, b, c, k], [])
+
+        used, num_tensors, _ = P._collect_used_slots()
+        slot_to_tid, _ = P._create_slot_mappings(used)
+
+        self.assertEqual(slot_to_tid[a], slot_to_tid[b], "reused idx must coalesce")
+        self.assertNotEqual(slot_to_tid[a], slot_to_tid[c], "distinct idx stays distinct")
+        # Counts reflect distinct (id_space, idx), not distinct Slot objects.
+        self.assertEqual(num_tensors[IdSpace.Temp], 2)
+        self.assertEqual(sum(num_tensors.values()), len(set(slot_to_tid.values())))
+        # Emitted Tid references collapse in the serialized graph too.
+        ref = {id(s): t for t, s in P._tid_slot_map}
+        self.assertEqual(ref[id(a)].idx, ref[id(b)].idx)
+        self.assertNotEqual(ref[id(a)].idx, ref[id(c)].idx)
+
+    def test_reused_vids_coalesce(self):
+        # SymInt and SymBool share the vid pool, so equal idx must coalesce.
+        v0 = Slot(IdType.SymInt, IdSpace.Temp, 0)
+        v0b = Slot(IdType.SymBool, IdSpace.Temp, 0)
+        v1 = Slot(IdType.SymInt, IdSpace.Temp, 1)
+        P = self._builder_with_slots([], [v0, v0b, v1])
+
+        used, _, num_values = P._collect_used_slots()
+        _, slot_to_vid = P._create_slot_mappings(used)
+
+        self.assertEqual(slot_to_vid[v0], slot_to_vid[v0b], "shared vid idx must coalesce")
+        self.assertNotEqual(slot_to_vid[v0], slot_to_vid[v1])
+        self.assertEqual(num_values[IdSpace.Temp], 2)
+        self.assertEqual(sum(num_values.values()), len(set(slot_to_vid.values())))
+
+
+if __name__ == "__main__":
+    unittest.main()