From 1fe3a13f8476f333cb825e0b4a7f436a27684f36 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 21:08:15 +1000 Subject: [PATCH 01/23] model_management: disable non-dynamic smart memory Disable smart memory outright for non-dynamic models. This is a minor step towards deprecation of --disable-dynamic-vram and the legacy ModelPatcher. This is needed for estimate-free model development, where new models can opt out of supplying a memory estimate and not have to worry about hard VRAM allocations due to legacy non-dynamic model patchers. This is also a general stability increase for a lot of stray use cases where estimates may still be off, and going forward we are not going to maintain such estimates accurately. --- comfy/model_management.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 21738a4c7816..ebef03ceb62a 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -674,10 +674,10 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins i = x[-1] memory_to_free = 1e32 pins_to_free = 1e32 - if not DISABLE_SMART_MEMORY or device is None: + if current_loaded_models[i].model.is_dynamic() and (not DISABLE_SMART_MEMORY or device is None): memory_to_free = 0 if device is None else memory_required - get_free_memory(device) pins_to_free = pins_required - get_free_ram() - if current_loaded_models[i].model.is_dynamic() and for_dynamic: + if for_dynamic: #don't actually unload dynamic models for the sake of other dynamic models #as that works on-demand. memory_required -= current_loaded_models[i].model.loaded_size() From 157965a1c99792e6250e6027ba2045efdd148528 Mon Sep 17 00:00:00 2001 From: Rattus Date: Mon, 4 May 2026 12:32:12 +1000 Subject: [PATCH 02/23] pinned_memory: implement with aimdo growable buffer Use a single growable buffer so we can do threaded pre-warming on pinned memory.
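Illustrative sketch, not part of the patch: the growable-pin-buffer pattern patch 02 introduces, assuming only the comfy_aimdo HostBuffer calls visible in the diff below (HostBuffer(size), extend(), truncate(), size) and comfy_aimdo.torch.hostbuf_to_tensor(). Function names here are hypothetical.

    import comfy_aimdo.host_buffer
    import comfy_aimdo.torch

    # One growable pinned buffer per (model, device); modules borrow slices of it.
    pin_state = {"hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), "stack": [], "failed": False}

    def pin_slice(module, size):
        hostbuf = pin_state["hostbuf"]
        offset = hostbuf.size              # append the new pin at the current end
        try:
            hostbuf.extend(size=size)      # grow the single pinned allocation in place
        except RuntimeError:
            pin_state["failed"] = True     # no more pinnable RAM; fall back to pageable copies
            return None
        # View the whole buffer as a tensor and hand back just this module's region.
        pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
        pin_state["stack"].append((module, offset))
        return pin

    def unpin_newest():
        # LIFO unpin: popping the most recent slice lets the buffer shrink by truncation.
        module, offset = pin_state["stack"].pop()
        pin_state["hostbuf"].truncate(offset)

Keeping every pin inside one registration is what the later patches lean on: growth happens in large steps instead of one page-lock call per module.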
--- comfy/model_patcher.py | 34 +++++++++++++++++++++------------- comfy/pinned_memory.py | 30 ++++++++++-------------------- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 2ea14bc2c8a4..dc5f0e577ec9 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -35,6 +35,7 @@ import comfy.ops import comfy.patcher_extension import comfy.utils +import comfy_aimdo.host_buffer from comfy.comfy_types import UnetWrapperFunction from comfy.quant_ops import QuantizedTensor from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP @@ -1543,6 +1544,10 @@ def __init__(self, model, load_device, offload_device, size=0, weight_inplace_up super().__init__(model, load_device, offload_device, size, weight_inplace_update) if not hasattr(self.model, "dynamic_vbars"): self.model.dynamic_vbars = {} + if not hasattr(self.model, "dynamic_pins"): + self.model.dynamic_pins = {} + if self.load_device not in self.model.dynamic_pins: + self.model.dynamic_pins[self.load_device] = {"hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), "stack": [], "failed": False} self.non_dynamic_delegate_model = None assert load_device is not None @@ -1604,6 +1609,8 @@ def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False self.unpatch_hooks() vbar = self._vbar_get(create=True) + pin_state = self.model.dynamic_pins[self.load_device] + pin_state["failed"] = False if vbar is not None: vbar.prioritize() @@ -1655,8 +1662,8 @@ def force_load_param(self, param_key, device_to): if hasattr(m, "comfy_cast_weights"): m.comfy_cast_weights = True - m.pin_failed = False m.seed_key = n + m._pin_state = pin_state set_dirty(m, dirty) force_load, v_weight_size = setup_param(self, m, n, "weight") @@ -1734,20 +1741,21 @@ def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=Fals return freed def pinned_memory_size(self): - total = 0 - loading = self._load_list(for_dynamic=True) - for x in loading: - _, _, _, _, m, _ = x - pin = comfy.pinned_memory.get_pin(m) - if pin is not None: - total += pin.numel() * pin.element_size() - return total + return self.model.dynamic_pins[self.load_device]["hostbuf"].size def partially_unload_ram(self, ram_to_unload): - loading = self._load_list(for_dynamic=True, default_device=self.offload_device) - for x in loading: - *_, m, _ = x - ram_to_unload -= comfy.pinned_memory.unpin_memory(m) + pin_state = self.model.dynamic_pins[self.load_device] + hostbuf = pin_state["hostbuf"] + stack = self.model.dynamic_pins[self.load_device]["stack"] + while len(stack) > 0: + module, offset = stack.pop() + size = module._pin.numel() * module._pin.element_size() + del module._pin + hostbuf.truncate(offset) + comfy.model_management.TOTAL_PINNED_MEMORY -= size + if comfy.model_management.TOTAL_PINNED_MEMORY < 0: + comfy.model_management.TOTAL_PINNED_MEMORY = 0 + ram_to_unload -= size if ram_to_unload <= 0: return diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 6d3ba367a798..3638066c8825 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -9,35 +9,25 @@ def get_pin(module): return getattr(module, "_pin", None) def pin_memory(module): - if module.pin_failed or args.disable_pinned_memory or get_pin(module) is not None: + pin_state = module._pin_state + if pin_state["failed"] or args.disable_pinned_memory or get_pin(module) is not None: return + hostbuf = pin_state["hostbuf"] size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) - + offset = 
hostbuf.size if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY: - module.pin_failed = True + pin_state["failed"] = True return False try: - hostbuf = comfy_aimdo.host_buffer.HostBuffer(size) + hostbuf.extend(size=size) except RuntimeError: - module.pin_failed = True + pin_state["failed"] = True return False - module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf) - module._pin_hostbuf = hostbuf + module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] + module._pin.untyped_storage()._comfy_hostbuf = hostbuf + pin_state["stack"].append((module, offset)) comfy.model_management.TOTAL_PINNED_MEMORY += size return True - -def unpin_memory(module): - if get_pin(module) is None: - return 0 - size = module._pin.numel() * module._pin.element_size() - - comfy.model_management.TOTAL_PINNED_MEMORY -= size - if comfy.model_management.TOTAL_PINNED_MEMORY < 0: - comfy.model_management.TOTAL_PINNED_MEMORY = 0 - - del module._pin - del module._pin_hostbuf - return size From b66b6420681a83f5bd247dec42d95f96113503d7 Mon Sep 17 00:00:00 2001 From: Rattus Date: Mon, 4 May 2026 12:47:28 +1000 Subject: [PATCH 03/23] mm: use aimdo to do transfer from disk to pin Aimdo implements a faster threaded loader. --- comfy/memory_management.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 48e3c11da9b6..4a628b05c18f 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -48,6 +48,12 @@ def read_tensor_file_slice_into(tensor, destination): if info.size == 0: return True + hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None) + if hostbuf is not None: + hostbuf.read_file_slice(file_obj, info.offset, info.size, + offset=destination.data_ptr() - hostbuf.get_raw_address()) + return True + buf_type = ctypes.c_ubyte * info.size view = memoryview(buf_type.from_address(destination.data_ptr())) From 8070cb77809145e7cf24b94eeb7f55710cdfcd17 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 14:04:48 +1000 Subject: [PATCH 04/23] Add stream host pin buffer for AIMDO casts Introduce per-offload-stream HostBuffer reuse for pinned staging, include it in cast buffer reset synchronization. Defer actual casts that go via this pin path to a separate pass such that the buffer can be allocated monolithically (to avoid cudaHostRegister thrash). 
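Sketch only, not part of the series: the disk-to-pin fast path patch 03 adds, pulled out on its own. It assumes the HostBuffer.read_file_slice()/get_raw_address() calls shown in that diff; file_obj, file_offset and size are stand-ins for the TensorFileSlice bookkeeping that read_tensor_file_slice_into() normally supplies.

    def read_into_pinned(destination, file_obj, file_offset, size):
        # If the destination tensor is backed by an aimdo HostBuffer, let aimdo's
        # threaded reader copy the file slice straight into the pinned region.
        hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None)
        if hostbuf is None:
            return False  # not pinned through a HostBuffer; caller falls back to the slow path
        hostbuf.read_file_slice(file_obj, file_offset, size,
                                offset=destination.data_ptr() - hostbuf.get_raw_address())
        return True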
--- comfy/model_management.py | 18 ++++++++++-- comfy/ops.py | 58 +++++++++++++++++++++++++++++++-------- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index ebef03ceb62a..facdd0873d7a 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -31,6 +31,7 @@ import comfy.memory_management import comfy.utils import comfy.quant_ops +import comfy_aimdo.host_buffer import comfy_aimdo.vram_buffer class VRAMState(Enum): @@ -1180,8 +1181,10 @@ def current_stream(device): LARGEST_CASTED_WEIGHT = (None, 0) STREAM_AIMDO_CAST_BUFFERS = {} LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) +STREAM_PIN_BUFFERS = {} DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3 +DEFAULT_PIN_BUFFER_PRIME_SIZE = 1024 ** 2 def get_cast_buffer(offload_stream, device, size, ref): global LARGEST_CASTED_WEIGHT @@ -1220,21 +1223,32 @@ def get_aimdo_cast_buffer(offload_stream, device): if cast_buffer is None: cast_buffer = comfy_aimdo.vram_buffer.VRAMBuffer(DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE, device.index) STREAM_AIMDO_CAST_BUFFERS[offload_stream] = cast_buffer - return cast_buffer + +def get_pin_buffer(offload_stream): + pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None) + if pin_buffer is None: + # A small non-zero default primes HostBuffer's larger virtual reservation. + pin_buffer = comfy_aimdo.host_buffer.HostBuffer(DEFAULT_PIN_BUFFER_PRIME_SIZE) + STREAM_PIN_BUFFERS[offload_stream] = pin_buffer + elif offload_stream is not None: + offload_stream.synchronize() + return pin_buffer + def reset_cast_buffers(): global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT LARGEST_CASTED_WEIGHT = (None, 0) LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) - for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS): + for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS) | set(STREAM_PIN_BUFFERS): if offload_stream is not None: offload_stream.synchronize() synchronize() STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() + STREAM_PIN_BUFFERS.clear() soft_empty_cache() def get_offload_stream(device): diff --git a/comfy/ops.py b/comfy/ops.py index 77ad1d5276da..3d196f43877a 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -75,6 +75,8 @@ def scaled_dot_product_attention(q, k, v, *args, **kwargs): cast_to = comfy.model_management.cast_to #TODO: remove once no more references +STREAM_PIN_BUFFER_HEADROOM = 8 * 1024 * 1024 + def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) @@ -91,6 +93,9 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin offload_stream = None cast_buffer = None cast_buffer_offset = 0 + stream_pin_hostbuf = None + stream_pin_offset = 0 + stream_pin_queue = [] def ensure_offload_stream(module, required_size, check_largest): nonlocal offload_stream @@ -124,6 +129,20 @@ def get_cast_buffer(buffer_size): cast_buffer_offset += buffer_size return buffer + def get_stream_pin_buffer_offset(buffer_size): + nonlocal stream_pin_hostbuf + nonlocal stream_pin_offset + + if buffer_size == 0 or offload_stream is None: + return None + + if stream_pin_hostbuf is None: + stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream) + + offset = stream_pin_offset + stream_pin_offset += buffer_size + return offset + for s in comfy_modules: signature = comfy_aimdo.model_vbar.vbar_fault(s._v) resident = 
comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature) @@ -162,17 +181,21 @@ def get_cast_buffer(buffer_size): if xfer_dest is None: xfer_dest = get_cast_buffer(dest_size) - if signature is None and pin is None: - comfy.pinned_memory.pin_memory(s) - pin = comfy.pinned_memory.get_pin(s) - else: - pin = None - - if pin is not None: - comfy.model_management.cast_to_gathered(xfer_source, pin) - xfer_source = [ pin ] - #send it over - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + if pin is None: + if signature is None: + comfy.pinned_memory.pin_memory(s) + pin = comfy.pinned_memory.get_pin(s) + if pin is not None: + comfy.model_management.cast_to_gathered(xfer_source, pin) + xfer_source = [ pin ] + if pin is None: + pin_offset = get_stream_pin_buffer_offset(dest_size) + if pin_offset is not None: + stream_pin_queue.append((xfer_source, pin_offset, dest_size, xfer_dest)) + xfer_source = None + + if xfer_source is not None: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) for param_key in ("weight", "bias"): lowvram_fn = getattr(s, param_key + "_lowvram_function", None) @@ -186,6 +209,19 @@ def get_cast_buffer(buffer_size): prefetch["needs_cast"] = needs_cast s._prefetch = prefetch + if stream_pin_offset > 0: + stream_pin_hostbuf_size = getattr(stream_pin_hostbuf, "_comfy_stream_pin_size", stream_pin_hostbuf.size) + if stream_pin_hostbuf_size < stream_pin_offset: + stream_pin_hostbuf_size = stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM + stream_pin_hostbuf.extend(size=stream_pin_hostbuf_size, reallocate=True) + stream_pin_hostbuf._comfy_stream_pin_size = stream_pin_hostbuf_size + stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) + stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf + for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: + pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] + comfy.model_management.cast_to_gathered(xfer_source, pin) + comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) + return offload_stream From 17955235b2d95e9e8aa6f9719bcc1a29d8976ceb Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 18:33:02 +1000 Subject: [PATCH 05/23] remove old pin path --- comfy/model_management.py | 74 +++++++-------------------------------- comfy/model_patcher.py | 3 -- comfy/utils.py | 2 -- comfy/windows.py | 52 --------------------------- 4 files changed, 13 insertions(+), 118 deletions(-) delete mode 100644 comfy/windows.py diff --git a/comfy/model_management.py b/comfy/model_management.py index facdd0873d7a..4b96d1492e4b 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -496,6 +496,8 @@ def get_torch_device_name(device): current_loaded_models = [] +DIRTY_MMAPS = set() + def module_size(module): module_mem = 0 sd = module.state_dict() @@ -504,27 +506,11 @@ def module_size(module): module_mem += t.nbytes return module_mem -def module_mmap_residency(module, free=False): - mmap_touched_mem = 0 - module_mem = 0 - bounced_mmaps = set() - sd = module.state_dict() - for k in sd: - t = sd[k] - module_mem += t.nbytes - storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage() - if not getattr(storage, "_comfy_tensor_mmap_touched", False): - continue - mmap_touched_mem += t.nbytes - if not free: - continue - 
storage._comfy_tensor_mmap_touched = False - mmap_obj = storage._comfy_tensor_mmap_refs[0] - if mmap_obj in bounced_mmaps: - continue - mmap_obj.bounce() - bounced_mmaps.add(mmap_obj) - return mmap_touched_mem, module_mem +def mark_mmap_dirty(storage): + mmap_refs = getattr(storage, "_comfy_tensor_mmap_refs", None) + if mmap_refs is not None: + DIRTY_MMAPS.add(mmap_refs[0]) + class LoadedModel: def __init__(self, model): @@ -554,9 +540,6 @@ def model(self): def model_memory(self): return self.model.model_size() - def model_mmap_residency(self, free=False): - return self.model.model_mmap_residency(free=free) - def model_loaded_memory(self): return self.model.loaded_size() @@ -636,15 +619,9 @@ def offloaded_memory(loaded_models, device): EXTRA_RESERVED_VRAM = 400 * 1024 * 1024 if WINDOWS: - import comfy.windows EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue if total_vram > (15 * 1024): # more extra reserved vram on 16GB+ cards EXTRA_RESERVED_VRAM += 100 * 1024 * 1024 - def get_free_ram(): - return comfy.windows.get_free_ram() -else: - def get_free_ram(): - return psutil.virtual_memory().available if args.reserve_vram is not None: EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024 @@ -658,7 +635,6 @@ def minimum_inference_memory(): def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins_required=0, ram_required=0): cleanup_models_gc() - comfy.memory_management.extra_ram_release(max(pins_required, ram_required)) unloaded_model = [] can_unload = [] unloaded_models = [] @@ -674,10 +650,8 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins for x in can_unload_sorted: i = x[-1] memory_to_free = 1e32 - pins_to_free = 1e32 if current_loaded_models[i].model.is_dynamic() and (not DISABLE_SMART_MEMORY or device is None): memory_to_free = 0 if device is None else memory_required - get_free_memory(device) - pins_to_free = pins_required - get_free_ram() if for_dynamic: #don't actually unload dynamic models for the sake of other dynamic models #as that works on-demand. 
@@ -686,18 +660,6 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins if memory_to_free > 0 and current_loaded_models[i].model_unload(memory_to_free): logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}") unloaded_model.append(i) - if pins_to_free > 0: - logging.debug(f"PIN Unloading {current_loaded_models[i].model.model.__class__.__name__}") - current_loaded_models[i].model.partially_unload_ram(pins_to_free) - - for x in can_unload_sorted: - i = x[-1] - ram_to_free = ram_required - psutil.virtual_memory().available - if ram_to_free <= 0 and i not in unloaded_model: - continue - resident_memory, _ = current_loaded_models[i].model_mmap_residency(free=True) - if resident_memory > 0: - logging.debug(f"RAM Unloading {current_loaded_models[i].model.model.__class__.__name__}") for i in sorted(unloaded_model, reverse=True): unloaded_models.append(current_loaded_models.pop(i)) @@ -763,29 +725,16 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu model_to_unload.model.detach(unpatch_all=False) model_to_unload.model_finalizer.detach() - total_memory_required = {} - total_pins_required = {} - total_ram_required = {} for loaded_model in models_to_load: device = loaded_model.device total_memory_required[device] = total_memory_required.get(device, 0) + loaded_model.model_memory_required(device) - resident_memory, model_memory = loaded_model.model_mmap_residency() - pinned_memory = loaded_model.model.pinned_memory_size() - #FIXME: This can over-free the pins as it budgets to pin the entire model. We should - #make this JIT to keep as much pinned as possible. - pins_required = model_memory - pinned_memory - ram_required = model_memory - resident_memory - total_pins_required[device] = total_pins_required.get(device, 0) + pins_required - total_ram_required[device] = total_ram_required.get(device, 0) + ram_required for device in total_memory_required: if device != torch.device("cpu"): free_memory(total_memory_required[device] * 1.1 + extra_mem, device, - for_dynamic=free_for_dynamic, - pins_required=total_pins_required[device], - ram_required=total_ram_required[device]) + for_dynamic=free_for_dynamic) for device in total_memory_required: if device != torch.device("cpu"): @@ -1246,6 +1195,10 @@ def reset_cast_buffers(): offload_stream.synchronize() synchronize() + for mmap_obj in DIRTY_MMAPS: + mmap_obj.bounce() + + DIRTY_MMAPS.clear() STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() STREAM_PIN_BUFFERS.clear() @@ -1310,8 +1263,7 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None): if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view): continue storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() - if hasattr(storage, "_comfy_tensor_mmap_touched"): - storage._comfy_tensor_mmap_touched = True + mark_mmap_dirty(storage) dest_view.copy_(tensor, non_blocking=non_blocking) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index dc5f0e577ec9..43712c7a0359 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -342,9 +342,6 @@ def model_size(self): self.size = comfy.model_management.module_size(self.model) return self.size - def model_mmap_residency(self, free=False): - return comfy.model_management.module_mmap_residency(self.model, free=free) - def loaded_size(self): return self.model.model_loaded_weight_memory diff --git a/comfy/utils.py b/comfy/utils.py index 
b759720274f1..fabe18b510cd 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -113,7 +113,6 @@ def load_safetensors(ckpt): "_comfy_tensor_file_slice", comfy.memory_management.TensorFileSlice(f, threading.get_ident(), data_base_offset + start, end - start)) setattr(storage, "_comfy_tensor_mmap_refs", (model_mmap, mv)) - setattr(storage, "_comfy_tensor_mmap_touched", False) sd[name] = tensor return sd, header.get("__metadata__", {}), @@ -1445,4 +1444,3 @@ def deepcopy_list_dict(obj, memo=None): memo[obj_id] = res return res - diff --git a/comfy/windows.py b/comfy/windows.py deleted file mode 100644 index 213dc481d937..000000000000 --- a/comfy/windows.py +++ /dev/null @@ -1,52 +0,0 @@ -import ctypes -import logging -import psutil -from ctypes import wintypes - -import comfy_aimdo.control - -psapi = ctypes.WinDLL("psapi") -kernel32 = ctypes.WinDLL("kernel32") - -class PERFORMANCE_INFORMATION(ctypes.Structure): - _fields_ = [ - ("cb", wintypes.DWORD), - ("CommitTotal", ctypes.c_size_t), - ("CommitLimit", ctypes.c_size_t), - ("CommitPeak", ctypes.c_size_t), - ("PhysicalTotal", ctypes.c_size_t), - ("PhysicalAvailable", ctypes.c_size_t), - ("SystemCache", ctypes.c_size_t), - ("KernelTotal", ctypes.c_size_t), - ("KernelPaged", ctypes.c_size_t), - ("KernelNonpaged", ctypes.c_size_t), - ("PageSize", ctypes.c_size_t), - ("HandleCount", wintypes.DWORD), - ("ProcessCount", wintypes.DWORD), - ("ThreadCount", wintypes.DWORD), - ] - -def get_free_ram(): - #Windows is way too conservative and chalks recently used uncommitted model RAM - #as "in-use". So, calculate free RAM for the sake of general use as the greater of: - # - #1: What psutil says - #2: Total Memory - (Committed Memory - VRAM in use) - # - #We have to subtract VRAM in use from the comitted memory as WDDM creates a naked - #commit charge for all VRAM used just incase it wants to page it all out. This just - #isn't realistic so "overcommit" on our calculations by just subtracting it off. - - pi = PERFORMANCE_INFORMATION() - pi.cb = ctypes.sizeof(pi) - - if not psapi.GetPerformanceInfo(ctypes.byref(pi), pi.cb): - logging.warning("WARNING: Failed to query windows performance info. RAM usage may be sub optimal") - return psutil.virtual_memory().available - - committed = pi.CommitTotal * pi.PageSize - total = pi.PhysicalTotal * pi.PageSize - - return max(psutil.virtual_memory().available, - total - (committed - comfy_aimdo.control.get_total_vram_usage())) - From 8187cd783e20ac71cbf88d51338d632996838cb3 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 19:55:41 +1000 Subject: [PATCH 06/23] Implement JIT pinned memory pressure Replace the predictive pin pressure mechanism with JIT PIN memory pressure. 
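Sketch only: the JIT pressure idea described above, shown standalone. is_dynamic(), dynamic_pins and partially_unload_ram() mirror names from the diff below; max_pinned/total_pinned stand in for MAX_PINNED_MEMORY/TOTAL_PINNED_MEMORY, and the hysteresis keeps one shortfall from re-triggering eviction on every subsequent pin.

    PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024

    def ensure_pin_budget_sketch(size, loaded_models, max_pinned, total_pinned, evict_active=False):
        if max_pinned <= 0:
            return
        shortfall = total_pinned + size - max_pinned
        if shortfall <= 0:
            return  # the new pin still fits; nothing to evict
        # Free a little more than strictly needed so small follow-up pins don't
        # immediately force another round of eviction.
        shortfall += PIN_PRESSURE_HYSTERESIS
        # Walk models from least to most recently used, dropping their pins on demand.
        for loaded_model in reversed(loaded_models):
            model = loaded_model.model
            if model is None or not model.is_dynamic():
                continue
            if not evict_active and model.dynamic_pins[model.load_device]["active"]:
                continue  # skip the model that is actively loading its pins
            shortfall -= model.partially_unload_ram(shortfall)
            if shortfall <= 0:
                break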
--- comfy/model_management.py | 52 ++++++++++++++++++++++++++++++++++----- comfy/model_patcher.py | 17 ++++++++++--- comfy/ops.py | 8 ++++-- comfy/pinned_memory.py | 4 +-- 4 files changed, 66 insertions(+), 15 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 4b96d1492e4b..6a2126cb59bb 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -498,6 +498,8 @@ def get_torch_device_name(device): DIRTY_MMAPS = set() +PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024 + def module_size(module): module_mem = 0 sd = module.state_dict() @@ -511,6 +513,21 @@ def mark_mmap_dirty(storage): if mmap_refs is not None: DIRTY_MMAPS.add(mmap_refs[0]) +def ensure_pin_budget(size, evict_active=False): + if MAX_PINNED_MEMORY <= 0: + return + + shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + if shortfall <= 0: + return + + shortfall += PIN_PRESSURE_HYSTERESIS + for loaded_model in reversed(current_loaded_models): + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.dynamic_pins[model.load_device]["active"]): + shortfall -= model.partially_unload_ram(shortfall) + if shortfall <= 0: + break class LoadedModel: def __init__(self, model): @@ -1133,7 +1150,6 @@ def current_stream(device): STREAM_PIN_BUFFERS = {} DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3 -DEFAULT_PIN_BUFFER_PRIME_SIZE = 1024 ** 2 def get_cast_buffer(offload_stream, device, size, ref): global LARGEST_CASTED_WEIGHT @@ -1177,14 +1193,29 @@ def get_aimdo_cast_buffer(offload_stream, device): def get_pin_buffer(offload_stream): pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None) if pin_buffer is None: - # A small non-zero default primes HostBuffer's larger virtual reservation. - pin_buffer = comfy_aimdo.host_buffer.HostBuffer(DEFAULT_PIN_BUFFER_PRIME_SIZE) + pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0) STREAM_PIN_BUFFERS[offload_stream] = pin_buffer elif offload_stream is not None: offload_stream.synchronize() return pin_buffer +def resize_pin_buffer(pin_buffer, size): + global TOTAL_PINNED_MEMORY + old_size = getattr(pin_buffer, "_comfy_stream_pin_size", 0) + if size <= old_size: + return True + growth = size - old_size + ensure_pin_budget(growth, evict_active=True) + try: + pin_buffer.extend(size=size, reallocate=True) + except RuntimeError: + return False + pin_buffer._comfy_stream_pin_size = size + TOTAL_PINNED_MEMORY += growth + return True + def reset_cast_buffers(): + global TOTAL_PINNED_MEMORY global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT @@ -1197,8 +1228,18 @@ def reset_cast_buffers(): for mmap_obj in DIRTY_MMAPS: mmap_obj.bounce() - DIRTY_MMAPS.clear() + + for pin_buffer in STREAM_PIN_BUFFERS.values(): + TOTAL_PINNED_MEMORY -= getattr(pin_buffer, "_comfy_stream_pin_size", 0) + if TOTAL_PINNED_MEMORY < 0: + TOTAL_PINNED_MEMORY = 0 + + for loaded_model in current_loaded_models: + model = loaded_model.model + if model is not None and model.is_dynamic(): + model.dynamic_pins[model.load_device]["active"] = False + STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() STREAM_PIN_BUFFERS.clear() @@ -1344,8 +1385,7 @@ def pin_memory(tensor): return False size = tensor.nbytes - if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY: - return False + ensure_pin_budget(size) ptr = tensor.data_ptr() if ptr == 0: diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 43712c7a0359..def0901dcd4c 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1117,7 +1117,7 @@ def 
pinned_memory_size(self): return 0 def partially_unload_ram(self, ram_to_unload): - pass + return 0 def detach(self, unpatch_all=True): self.eject_model() @@ -1544,7 +1544,12 @@ def __init__(self, model, load_device, offload_device, size=0, weight_inplace_up if not hasattr(self.model, "dynamic_pins"): self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: - self.model.dynamic_pins[self.load_device] = {"hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), "stack": [], "failed": False} + self.model.dynamic_pins[self.load_device] = { + "hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), + "stack": [], + "failed": False, + "active": False, + } self.non_dynamic_delegate_model = None assert load_device is not None @@ -1608,6 +1613,7 @@ def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False vbar = self._vbar_get(create=True) pin_state = self.model.dynamic_pins[self.load_device] pin_state["failed"] = False + pin_state["active"] = True if vbar is not None: vbar.prioritize() @@ -1741,9 +1747,10 @@ def pinned_memory_size(self): return self.model.dynamic_pins[self.load_device]["hostbuf"].size def partially_unload_ram(self, ram_to_unload): + freed = 0 pin_state = self.model.dynamic_pins[self.load_device] hostbuf = pin_state["hostbuf"] - stack = self.model.dynamic_pins[self.load_device]["stack"] + stack = pin_state["stack"] while len(stack) > 0: module, offset = stack.pop() size = module._pin.numel() * module._pin.element_size() @@ -1752,9 +1759,11 @@ def partially_unload_ram(self, ram_to_unload): comfy.model_management.TOTAL_PINNED_MEMORY -= size if comfy.model_management.TOTAL_PINNED_MEMORY < 0: comfy.model_management.TOTAL_PINNED_MEMORY = 0 + freed += size ram_to_unload -= size if ram_to_unload <= 0: - return + return freed + return freed def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False): #This isn't used by the core at all and can only be to load a model out of diff --git a/comfy/ops.py b/comfy/ops.py index 3d196f43877a..ee3184894305 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -138,6 +138,8 @@ def get_stream_pin_buffer_offset(buffer_size): if stream_pin_hostbuf is None: stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream) + if stream_pin_hostbuf is None: + return None offset = stream_pin_offset stream_pin_offset += buffer_size @@ -213,8 +215,10 @@ def get_stream_pin_buffer_offset(buffer_size): stream_pin_hostbuf_size = getattr(stream_pin_hostbuf, "_comfy_stream_pin_size", stream_pin_hostbuf.size) if stream_pin_hostbuf_size < stream_pin_offset: stream_pin_hostbuf_size = stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM - stream_pin_hostbuf.extend(size=stream_pin_hostbuf_size, reallocate=True) - stream_pin_hostbuf._comfy_stream_pin_size = stream_pin_hostbuf_size + if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_hostbuf_size): + for xfer_source, _, _, xfer_dest in stream_pin_queue: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + return offload_stream stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 3638066c8825..a35759aad382 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -16,9 +16,7 @@ def 
pin_memory(module): hostbuf = pin_state["hostbuf"] size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size - if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY: - pin_state["failed"] = True - return False + comfy.model_management.ensure_pin_budget(size) try: hostbuf.extend(size=size) From 2b927e17838b733dde6a660fe521bb0f13768528 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 23:50:37 +1000 Subject: [PATCH 07/23] LowVRAMPatch: change to two-phase visit --- comfy/lora.py | 19 +++++++++++++------ comfy/model_patcher.py | 11 +++++++++-- comfy/ops.py | 2 +- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/comfy/lora.py b/comfy/lora.py index db8f16bcb5ae..f7c7c21a5847 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -475,16 +475,23 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori return weight -def prefetch_prepared_value(value, allocate_buffer, stream): +def prefetch_prepared_value(value, counter, destination, stream): if isinstance(value, torch.Tensor): - dest = allocate_buffer(comfy.memory_management.vram_aligned_size(value)) - comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) + size = comfy.memory_management.vram_aligned_size(value) + offset = counter[0] + counter[0] += size + if destination is None: + return value + + dest = destination[offset:offset + size] + if stream is not None: + comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) return comfy.memory_management.interpret_gathered_like([value], dest)[0] elif isinstance(value, weight_adapter.WeightAdapterBase): - return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, allocate_buffer, stream)) + return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream)) elif isinstance(value, tuple): - return tuple(prefetch_prepared_value(item, allocate_buffer, stream) for item in value) + return tuple(prefetch_prepared_value(item, counter, destination, stream) for item in value) elif isinstance(value, list): - return [prefetch_prepared_value(item, allocate_buffer, stream) for item in value] + return [prefetch_prepared_value(item, counter, destination, stream) for item in value] return value diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index def0901dcd4c..dc58cd42ebcc 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -125,9 +125,16 @@ def __init__(self, key, patches, convert_func=None, set_func=None): self.set_func = set_func self.prepared_patches = None - def prepare(self, allocate_buffer, stream): + def memory_required(self): + counter = [0] + for patch in self.patches[self.key]: + comfy.lora.prefetch_prepared_value(patch[1], counter, None, None) + return counter[0] + + def prepare(self, destination, stream): + counter = [0] self.prepared_patches = [ - (patch[0], comfy.lora.prefetch_prepared_value(patch[1], allocate_buffer, stream), patch[2], patch[3], patch[4]) + (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream), patch[2], patch[3], patch[4]) for patch in self.patches[self.key] ] diff --git a/comfy/ops.py b/comfy/ops.py index ee3184894305..bd3de3677818 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -203,7 +203,7 @@ def get_stream_pin_buffer_offset(buffer_size): lowvram_fn = getattr(s, param_key + "_lowvram_function", None) if lowvram_fn 
is not None: ensure_offload_stream(s, cast_buffer_offset, False) - lowvram_fn.prepare(lambda size: get_cast_buffer(size), offload_stream) + lowvram_fn.prepare(get_cast_buffer(lowvram_fn.memory_required()), offload_stream) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest From 8e473d756f39c5cac5397a8c3b4442e75617068c Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 08:19:47 +1000 Subject: [PATCH 08/23] lora: re-implement as inplace swiss-army-knife operation --- comfy/lora.py | 10 +++++----- comfy/model_management.py | 7 +++---- comfy/model_patcher.py | 13 +++++++++---- comfy/ops.py | 25 ++++++++++++++++++------- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/comfy/lora.py b/comfy/lora.py index f7c7c21a5847..2b8699710612 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -475,7 +475,7 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori return weight -def prefetch_prepared_value(value, counter, destination, stream): +def prefetch_prepared_value(value, counter, destination, stream, copy): if isinstance(value, torch.Tensor): size = comfy.memory_management.vram_aligned_size(value) offset = counter[0] @@ -484,14 +484,14 @@ def prefetch_prepared_value(value, counter, destination, stream): return value dest = destination[offset:offset + size] - if stream is not None: + if copy: comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) return comfy.memory_management.interpret_gathered_like([value], dest)[0] elif isinstance(value, weight_adapter.WeightAdapterBase): - return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream)) + return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream, copy)) elif isinstance(value, tuple): - return tuple(prefetch_prepared_value(item, counter, destination, stream) for item in value) + return tuple(prefetch_prepared_value(item, counter, destination, stream, copy) for item in value) elif isinstance(value, list): - return [prefetch_prepared_value(item, counter, destination, stream) for item in value] + return [prefetch_prepared_value(item, counter, destination, stream, copy) for item in value] return value diff --git a/comfy/model_management.py b/comfy/model_management.py index 6a2126cb59bb..40f72fa1bd86 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1201,7 +1201,7 @@ def get_pin_buffer(offload_stream): def resize_pin_buffer(pin_buffer, size): global TOTAL_PINNED_MEMORY - old_size = getattr(pin_buffer, "_comfy_stream_pin_size", 0) + old_size = pin_buffer.size if size <= old_size: return True growth = size - old_size @@ -1210,8 +1210,7 @@ def resize_pin_buffer(pin_buffer, size): pin_buffer.extend(size=size, reallocate=True) except RuntimeError: return False - pin_buffer._comfy_stream_pin_size = size - TOTAL_PINNED_MEMORY += growth + TOTAL_PINNED_MEMORY += pin_buffer.size - old_size return True def reset_cast_buffers(): @@ -1231,7 +1230,7 @@ def reset_cast_buffers(): DIRTY_MMAPS.clear() for pin_buffer in STREAM_PIN_BUFFERS.values(): - TOTAL_PINNED_MEMORY -= getattr(pin_buffer, "_comfy_stream_pin_size", 0) + TOTAL_PINNED_MEMORY -= pin_buffer.size if TOTAL_PINNED_MEMORY < 0: TOTAL_PINNED_MEMORY = 0 diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index dc58cd42ebcc..a88603df95f7 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -118,6 +118,8 @@ def string_to_seed(data): return comfy.utils.string_to_seed(data) class 
LowVramPatch: + is_lowvram_patch = True + def __init__(self, key, patches, convert_func=None, set_func=None): self.key = key self.patches = patches @@ -128,15 +130,18 @@ def __init__(self, key, patches, convert_func=None, set_func=None): def memory_required(self): counter = [0] for patch in self.patches[self.key]: - comfy.lora.prefetch_prepared_value(patch[1], counter, None, None) + comfy.lora.prefetch_prepared_value(patch[1], counter, None, None, False) return counter[0] - def prepare(self, destination, stream): + def prepare(self, destination, stream, copy=True, commit=True): counter = [0] - self.prepared_patches = [ - (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream), patch[2], patch[3], patch[4]) + prepared_patches = [ + (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream, copy), patch[2], patch[3], patch[4]) for patch in self.patches[self.key] ] + if commit: + self.prepared_patches = prepared_patches + return prepared_patches def clear_prepared(self): self.prepared_patches = None diff --git a/comfy/ops.py b/comfy/ops.py index bd3de3677818..8603b50a66c7 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -203,7 +203,14 @@ def get_stream_pin_buffer_offset(buffer_size): lowvram_fn = getattr(s, param_key + "_lowvram_function", None) if lowvram_fn is not None: ensure_offload_stream(s, cast_buffer_offset, False) - lowvram_fn.prepare(get_cast_buffer(lowvram_fn.memory_required()), offload_stream) + lowvram_size = lowvram_fn.memory_required() + lowvram_dest = get_cast_buffer(lowvram_size) + lowvram_fn.prepare(lowvram_dest, None, copy=False, commit=True) + pin_offset = get_stream_pin_buffer_offset(lowvram_size) + if pin_offset is not None: + stream_pin_queue.append((lowvram_fn, pin_offset, lowvram_size, lowvram_dest)) + else: + lowvram_fn.prepare(lowvram_dest, offload_stream, copy=True, commit=True) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest @@ -211,19 +218,23 @@ def get_stream_pin_buffer_offset(buffer_size): prefetch["needs_cast"] = needs_cast s._prefetch = prefetch + def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): + if getattr(xfer_source, "is_lowvram_patch", False): + xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) + else: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) + if stream_pin_offset > 0: - stream_pin_hostbuf_size = getattr(stream_pin_hostbuf, "_comfy_stream_pin_size", stream_pin_hostbuf.size) - if stream_pin_hostbuf_size < stream_pin_offset: - stream_pin_hostbuf_size = stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM - if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_hostbuf_size): + if stream_pin_hostbuf.size < stream_pin_offset: + if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM): for xfer_source, _, _, xfer_dest in stream_pin_queue: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) return offload_stream stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] - comfy.model_management.cast_to_gathered(xfer_source, pin) + 
cast_maybe_lowvram_patch(xfer_source, pin, None) comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) return offload_stream From e48dace1452df67a3661bcf6d5144e4a7aa8f867 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 09:47:08 +1000 Subject: [PATCH 09/23] prepare for multiple pin sets --- comfy/model_management.py | 2 ++ comfy/model_patcher.py | 37 +++++++++++++++++++------------------ comfy/pinned_memory.py | 13 +++++++------ 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 40f72fa1bd86..ca4318a4535e 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1238,6 +1238,8 @@ def reset_cast_buffers(): model = loaded_model.model if model is not None and model.is_dynamic(): model.dynamic_pins[model.load_device]["active"] = False + model.partially_unload_ram(1e30, subsets=[ "patches" ]) + model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0), []) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index a88603df95f7..530db214cc9c 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1557,8 +1557,8 @@ def __init__(self, model, load_device, offload_device, size=0, weight_inplace_up self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: self.model.dynamic_pins[self.load_device] = { - "hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), - "stack": [], + "weights": (comfy_aimdo.host_buffer.HostBuffer(0), []), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0), []), "failed": False, "active": False, } @@ -1756,25 +1756,26 @@ def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=Fals return freed def pinned_memory_size(self): - return self.model.dynamic_pins[self.load_device]["hostbuf"].size + return (self.model.dynamic_pins[self.load_device]["weights"][0].size + + self.model.dynamic_pins[self.load_device]["patches"][0].size) - def partially_unload_ram(self, ram_to_unload): + def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]): freed = 0 pin_state = self.model.dynamic_pins[self.load_device] - hostbuf = pin_state["hostbuf"] - stack = pin_state["stack"] - while len(stack) > 0: - module, offset = stack.pop() - size = module._pin.numel() * module._pin.element_size() - del module._pin - hostbuf.truncate(offset) - comfy.model_management.TOTAL_PINNED_MEMORY -= size - if comfy.model_management.TOTAL_PINNED_MEMORY < 0: - comfy.model_management.TOTAL_PINNED_MEMORY = 0 - freed += size - ram_to_unload -= size - if ram_to_unload <= 0: - return freed + for subset in subsets: + hostbuf, stack = pin_state[subset] + while len(stack) > 0: + module, offset = stack.pop() + size = module._pin.numel() * module._pin.element_size() + del module._pin + hostbuf.truncate(offset) + comfy.model_management.TOTAL_PINNED_MEMORY -= size + if comfy.model_management.TOTAL_PINNED_MEMORY < 0: + comfy.model_management.TOTAL_PINNED_MEMORY = 0 + freed += size + ram_to_unload -= size + if ram_to_unload <= 0: + return freed return freed def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False): diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index a35759aad382..208c777f8f33 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -5,16 +5,17 @@ from comfy.cli_args import args -def get_pin(module): +def get_pin(module, 
subset="weights"): return getattr(module, "_pin", None) -def pin_memory(module): +def pin_memory(module, subset="weights", size=None): pin_state = module._pin_state - if pin_state["failed"] or args.disable_pinned_memory or get_pin(module) is not None: + if pin_state["failed"] or args.disable_pinned_memory or get_pin(module, subset) is not None: return - hostbuf = pin_state["hostbuf"] - size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) + hostbuf, stack = pin_state[subset] + if size is None: + size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size comfy.model_management.ensure_pin_budget(size) @@ -26,6 +27,6 @@ def pin_memory(module): module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] module._pin.untyped_storage()._comfy_hostbuf = hostbuf - pin_state["stack"].append((module, offset)) + stack.append((module, offset)) comfy.model_management.TOTAL_PINNED_MEMORY += size return True From 3a3b75a7e3cc1175e3f9f0d90c5838fb83c9b518 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 23:28:43 +1000 Subject: [PATCH 10/23] implement pinned loras --- comfy/model_management.py | 6 ++--- comfy/model_patcher.py | 8 +++--- comfy/ops.py | 55 +++++++++++++++++++++------------------ 3 files changed, 38 insertions(+), 31 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index ca4318a4535e..145a32080605 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -524,7 +524,7 @@ def ensure_pin_budget(size, evict_active=False): shortfall += PIN_PRESSURE_HYSTERESIS for loaded_model in reversed(current_loaded_models): model = loaded_model.model - if model is not None and model.is_dynamic() and (evict_active or not model.dynamic_pins[model.load_device]["active"]): + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): shortfall -= model.partially_unload_ram(shortfall) if shortfall <= 0: break @@ -1237,9 +1237,9 @@ def reset_cast_buffers(): for loaded_model in current_loaded_models: model = loaded_model.model if model is not None and model.is_dynamic(): - model.dynamic_pins[model.load_device]["active"] = False + model.model.dynamic_pins[model.load_device]["active"] = False model.partially_unload_ram(1e30, subsets=[ "patches" ]) - model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0), []) + model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 530db214cc9c..f4845bb43c03 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1557,8 +1557,8 @@ def __init__(self, model, load_device, offload_device, size=0, weight_inplace_up self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: self.model.dynamic_pins[self.load_device] = { - "weights": (comfy_aimdo.host_buffer.HostBuffer(0), []), - "patches": (comfy_aimdo.host_buffer.HostBuffer(0), []), + "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), []), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []), "failed": False, "active": False, } @@ -1651,7 +1651,9 @@ def setup_param(self, m, n, param_key): if key in self.patches: if comfy.lora.calculate_shape(self.patches[key], weight, key) != weight.shape: return (True, 0) - setattr(m, param_key + 
"_lowvram_function", LowVramPatch(key, self.patches)) + lowvram_patch = LowVramPatch(key, self.patches) + lowvram_patch._pin_state = pin_state + setattr(m, param_key + "_lowvram_function", lowvram_patch) num_patches += 1 else: setattr(m, param_key + "_lowvram_function", None) diff --git a/comfy/ops.py b/comfy/ops.py index 8603b50a66c7..629b54e4cf9c 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -183,34 +183,45 @@ def get_stream_pin_buffer_offset(buffer_size): if xfer_dest is None: xfer_dest = get_cast_buffer(dest_size) - if pin is None: + def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): + if xfer_source is not None: + if getattr(xfer_source, "is_lowvram_patch", False): + xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) + else: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) + + def handle_pin_miss(m, source, dest, subset="weights", size=None): + pin = None if signature is None: - comfy.pinned_memory.pin_memory(s) - pin = comfy.pinned_memory.get_pin(s) + comfy.pinned_memory.pin_memory(m, subset=subset, size=size) + pin = comfy.pinned_memory.get_pin(m, subset=subset) if pin is not None: - comfy.model_management.cast_to_gathered(xfer_source, pin) - xfer_source = [ pin ] + cast_maybe_lowvram_patch(source, pin, None) + return [ pin ] if pin is None: - pin_offset = get_stream_pin_buffer_offset(dest_size) + pin_offset = get_stream_pin_buffer_offset(size) if pin_offset is not None: - stream_pin_queue.append((xfer_source, pin_offset, dest_size, xfer_dest)) - xfer_source = None + stream_pin_queue.append((source, pin_offset, size, dest)) + return None + return source - if xfer_source is not None: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + if pin is None: + xfer_source = handle_pin_miss(s, xfer_source, xfer_dest, size=dest_size) + + cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) for param_key in ("weight", "bias"): - lowvram_fn = getattr(s, param_key + "_lowvram_function", None) - if lowvram_fn is not None: + lowvram_source = getattr(s, param_key + "_lowvram_function", None) + if lowvram_source is not None: ensure_offload_stream(s, cast_buffer_offset, False) - lowvram_size = lowvram_fn.memory_required() + lowvram_size = lowvram_source.memory_required() lowvram_dest = get_cast_buffer(lowvram_size) - lowvram_fn.prepare(lowvram_dest, None, copy=False, commit=True) - pin_offset = get_stream_pin_buffer_offset(lowvram_size) - if pin_offset is not None: - stream_pin_queue.append((lowvram_fn, pin_offset, lowvram_size, lowvram_dest)) - else: - lowvram_fn.prepare(lowvram_dest, offload_stream, copy=True, commit=True) + lowvram_source.prepare(lowvram_dest, None, copy=False, commit=True) + + pin = comfy.pinned_memory.get_pin(lowvram_source, subset="patches") + lowvram_source = handle_pin_miss(lowvram_source, lowvram_source, lowvram_dest, subset="patches", size=lowvram_size) if pin is None else [ pin ] + + cast_maybe_lowvram_patch(lowvram_source, lowvram_dest, offload_stream) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest @@ -218,12 +229,6 @@ def get_stream_pin_buffer_offset(buffer_size): prefetch["needs_cast"] = needs_cast s._prefetch = prefetch - def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): - if getattr(xfer_source, "is_lowvram_patch", False): - xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) - else: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, 
non_blocking=non_blocking, stream=stream) - if stream_pin_offset > 0: if stream_pin_hostbuf.size < stream_pin_offset: if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM): From c395f2d5b7ec83b22c597e0e0d936e3cc35f822e Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 23:30:47 +1000 Subject: [PATCH 11/23] requirements: comfy-aimdo 0.4.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c5a6f4cec2ce..eba0fc5ca757 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0 filelock av>=14.2.0 comfy-kitchen>=0.2.8 -comfy-aimdo==0.3.0 +comfy-aimdo==0.4.0 requests simpleeval>=1.0.0 blake3 From 44c0a0602b575287b48f09cdb11e6969683e39da Mon Sep 17 00:00:00 2001 From: Rattus Date: Mon, 11 May 2026 18:51:39 +1000 Subject: [PATCH 12/23] ops: remove unused arg This was defeatured in aimdo iteration --- comfy/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index 629b54e4cf9c..d425ea7eb3dc 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -235,7 +235,7 @@ def handle_pin_miss(m, source, dest, subset="weights", size=None): for xfer_source, _, _, xfer_dest in stream_pin_queue: cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) return offload_stream - stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) + stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf) stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] From ee927aafa8770a4bea8cfc2fcbedba5f86656097 Mon Sep 17 00:00:00 2001 From: Rattus Date: Sat, 9 May 2026 18:41:57 +1000 Subject: [PATCH 13/23] ops: sync the CPU with only the offload stream activity This was syncing with the offload stream which itself is synced with the compute stream, so this was syncing CPU with compute transitively. Define the event to sync it more gently. 
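Sketch only, using plain torch.cuda events: the gentler synchronization patch 13 describes. Instead of offload_stream.synchronize(), which transitively waits on the compute stream too, an event is recorded right after the pinned-buffer copies and the CPU later waits only on that; the _comfy_event attribute name mirrors the diff below.

    import torch

    def finish_prefetch(pin_buffer, offload_stream: torch.cuda.Stream):
        # Mark where the offload stream is right now; reusing the pin buffer later
        # only has to wait for this point, not for everything queued on compute.
        pin_buffer._comfy_event = offload_stream.record_event()

    def reuse_pin_buffer(pin_buffer):
        event = getattr(pin_buffer, "_comfy_event", None)
        if event is not None:
            event.synchronize()          # CPU blocks on the offload copies only
            delattr(pin_buffer, "_comfy_event")
        return pin_buffer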
--- comfy/model_management.py | 5 ++++- comfy/ops.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 145a32080605..c1d0901fc61d 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1196,7 +1196,10 @@ def get_pin_buffer(offload_stream): pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0) STREAM_PIN_BUFFERS[offload_stream] = pin_buffer elif offload_stream is not None: - offload_stream.synchronize() + event = getattr(pin_buffer, "_comfy_event", None) + if event is not None: + event.synchronize() + delattr(pin_buffer, "_comfy_event") return pin_buffer def resize_pin_buffer(pin_buffer, size): diff --git a/comfy/ops.py b/comfy/ops.py index d425ea7eb3dc..be744a030b05 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -241,6 +241,7 @@ def handle_pin_miss(m, source, dest, subset="weights", size=None): pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] cast_maybe_lowvram_patch(xfer_source, pin, None) comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) + stream_pin_hostbuf._comfy_event = offload_stream.record_event() return offload_stream From d61026d020946b986a3a4a1969d9198c90b7e8ec Mon Sep 17 00:00:00 2001 From: Rattus Date: Wed, 13 May 2026 09:17:23 +1000 Subject: [PATCH 14/23] pins: implement freeing intermediate for pinned memory Pinning is more important than inactive intermediates and the stream pin buffer is more important than even active intermediates. --- comfy/memory_management.py | 4 ++-- comfy/model_management.py | 2 ++ comfy/pinned_memory.py | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 4a628b05c18f..7645064f59a6 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -157,7 +157,7 @@ def set_ram_cache_release_state(callback, headroom): extra_ram_release_callback = callback RAM_CACHE_HEADROOM = max(0, int(headroom)) -def extra_ram_release(target): +def extra_ram_release(target, free_active=False): if extra_ram_release_callback is None: return 0 - return extra_ram_release_callback(target) + return extra_ram_release_callback(target, free_active=free_active) diff --git a/comfy/model_management.py b/comfy/model_management.py index c1d0901fc61d..697359d3a07d 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1208,6 +1208,7 @@ def resize_pin_buffer(pin_buffer, size): if size <= old_size: return True growth = size - old_size + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM, free_active=True) ensure_pin_budget(growth, evict_active=True) try: pin_buffer.extend(size=size, reallocate=True) @@ -1389,6 +1390,7 @@ def pin_memory(tensor): return False size = tensor.nbytes + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) ensure_pin_budget(size) ptr = tensor.data_ptr() diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 208c777f8f33..35cbbcd9e215 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -17,6 +17,7 @@ def pin_memory(module, subset="weights", size=None): if size is None: size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) comfy.model_management.ensure_pin_budget(size) try: From 3f717816e1f194fa2a9a105fb425b7bbfbb781f7 Mon Sep 17 00:00:00 2001 From: Rattus Date: Wed, 
13 May 2026 21:57:35 +1000 Subject: [PATCH 15/23] execution: implement pin eviction on RAM pressure Add back proper pin freeing on RAM pressure --- comfy/model_management.py | 19 ++++++++++++------- execution.py | 5 ++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 697359d3a07d..f358621c9b83 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -513,6 +513,17 @@ def mark_mmap_dirty(storage): if mmap_refs is not None: DIRTY_MMAPS.add(mmap_refs[0]) +def free_pins(size, evict_active=False): + if size <= 0: + return + + for loaded_model in reversed(current_loaded_models): + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): + size -= model.partially_unload_ram(size) + if size <= 0: + break + def ensure_pin_budget(size, evict_active=False): if MAX_PINNED_MEMORY <= 0: return @@ -521,13 +532,7 @@ def ensure_pin_budget(size, evict_active=False): if shortfall <= 0: return - shortfall += PIN_PRESSURE_HYSTERESIS - for loaded_model in reversed(current_loaded_models): - model = loaded_model.model - if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): - shortfall -= model.partially_unload_ram(shortfall) - if shortfall <= 0: - break + free_pins(shortfall + PIN_PRESSURE_HYSTERESIS, evict_active=evict_active) class LoadedModel: def __init__(self, model): diff --git a/execution.py b/execution.py index f37d0360d55c..5605f09e7d1d 100644 --- a/execution.py +++ b/execution.py @@ -2,6 +2,7 @@ import heapq import inspect import logging +import psutil import sys import threading import time @@ -780,7 +781,9 @@ async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs= execution_list.complete_node_execution() if self.cache_type == CacheType.RAM_PRESSURE: - comfy.model_management.free_memory(0, None, pins_required=ram_headroom, ram_required=ram_headroom) + ram_release_callback(ram_headroom) + ram_shortfall = ram_headroom - psutil.virtual_memory().available + comfy.model_management.free_pins(ram_shortfall) ram_release_callback(ram_headroom, free_active=True) else: # Only execute when the while-loop ends without break From 31150538b0a75734e46bc1f7a0d2bb1fce5d1fa9 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 15 May 2026 00:28:13 +1000 Subject: [PATCH 16/23] implement pin registration swaps Uncap the Windows pins from 50% by extending the pool and adding a pressure mechanism to move the pin reservations on demand. This unfortunately implies a GPU sync to do the freeing, so significant hysteresis needs to be added to consolidate these pressure events. --- comfy/model_management.py | 49 +++++++++++++++++++++++++++++---------- comfy/model_patcher.py | 39 +++++++++++++++++++++++++------ comfy/pinned_memory.py | 27 +++++++++++++++++++-- 3 files changed, 94 insertions(+), 21 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index f358621c9b83..19a9163620e6 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -498,7 +498,11 @@ def get_torch_device_name(device): DIRTY_MMAPS = set() -PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024 +PIN_PRESSURE_HYSTERESIS = 256 * 1024 * 1024 + +#Freeing registerables on pressure does imply a GPU sync, so go big on +#the hysteresis so each expensive sync gives us back a good chunk.
+REGISTERABLE_PIN_HYSTERESIS = 768 * 1024 * 1024 def module_size(module): module_mem = 0 @@ -525,15 +529,28 @@ def free_pins(size, evict_active=False): break def ensure_pin_budget(size, evict_active=False): - if MAX_PINNED_MEMORY <= 0: + if MAX_MODEL_MEMORY <= 0: return - shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + shortfall = TOTAL_MODEL_MEMORY + size - MAX_MODEL_MEMORY if shortfall <= 0: return free_pins(shortfall + PIN_PRESSURE_HYSTERESIS, evict_active=evict_active) +def ensure_pin_registerable(size, evict_active=False): + shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + if MAX_PINNED_MEMORY <= 0 or shortfall <= 0: + return + + shortfall += REGISTERABLE_PIN_HYSTERESIS + for loaded_model in reversed(current_loaded_models): + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): + shortfall -= model.unregister_inactive_pins(shortfall) + if shortfall <= 0: + return + class LoadedModel: def __init__(self, model): self._set_model(model) @@ -1208,22 +1225,24 @@ def get_pin_buffer(offload_stream): return pin_buffer def resize_pin_buffer(pin_buffer, size): - global TOTAL_PINNED_MEMORY + global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY old_size = pin_buffer.size if size <= old_size: return True growth = size - old_size comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM, free_active=True) ensure_pin_budget(growth, evict_active=True) + ensure_pin_registerable(growth, evict_active=True) try: pin_buffer.extend(size=size, reallocate=True) except RuntimeError: return False + TOTAL_MODEL_MEMORY += pin_buffer.size - old_size TOTAL_PINNED_MEMORY += pin_buffer.size - old_size return True def reset_cast_buffers(): - global TOTAL_PINNED_MEMORY + global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT @@ -1239,16 +1258,17 @@ def reset_cast_buffers(): DIRTY_MMAPS.clear() for pin_buffer in STREAM_PIN_BUFFERS.values(): + TOTAL_MODEL_MEMORY -= pin_buffer.size TOTAL_PINNED_MEMORY -= pin_buffer.size - if TOTAL_PINNED_MEMORY < 0: - TOTAL_PINNED_MEMORY = 0 + TOTAL_MODEL_MEMORY = max(0, TOTAL_MODEL_MEMORY) + TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY) for loaded_model in current_loaded_models: model = loaded_model.model if model is not None and model.is_dynamic(): model.model.dynamic_pins[model.load_device]["active"] = False model.partially_unload_ram(1e30, subsets=[ "patches" ]) - model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []) + model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1]) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() @@ -1352,14 +1372,18 @@ def cast_to_device(tensor, device, dtype, copy=False): PINNED_MEMORY = {} +TOTAL_MODEL_MEMORY = 0 TOTAL_PINNED_MEMORY = 0 +MAX_MODEL_MEMORY = -1 MAX_PINNED_MEMORY = -1 if not args.disable_pinned_memory: if is_nvidia() or is_amd(): + ram = get_total_memory(torch.device("cpu")) + MAX_MODEL_MEMORY = min(ram - 4 * 1024 * 1024 * 1024, ram * 0.90) if WINDOWS: - MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50% + MAX_PINNED_MEMORY = ram * 0.40 # Windows limit is apparently 50% else: - MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90 + MAX_PINNED_MEMORY = ram * 0.90 logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 
1024))) PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"]) @@ -1396,7 +1420,7 @@ def pin_memory(tensor): size = tensor.nbytes comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) - ensure_pin_budget(size) + ensure_pin_registerable(size) ptr = tensor.data_ptr() if ptr == 0: @@ -1433,7 +1457,8 @@ def unpin_memory(tensor): return False if torch.cuda.cudart().cudaHostUnregister(ptr) == 0: - TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr) + size = PINNED_MEMORY.pop(ptr) + TOTAL_PINNED_MEMORY -= size return True else: logging.warning("Unpin error.") diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index f4845bb43c03..7dc4d7801439 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1557,8 +1557,8 @@ def __init__(self, model, load_device, offload_device, size=0, weight_inplace_up self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: self.model.dynamic_pins[self.load_device] = { - "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), []), - "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []), + "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), [], [-1]), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1]), "failed": False, "active": False, } @@ -1761,19 +1761,44 @@ def pinned_memory_size(self): return (self.model.dynamic_pins[self.load_device]["weights"][0].size + self.model.dynamic_pins[self.load_device]["patches"][0].size) + def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]): + freed = 0 + pin_state = self.model.dynamic_pins[self.load_device] + for subset in subsets: + hostbuf, stack, stack_split = pin_state[subset] + split = stack_split[0] + while split >= 0: + module, offset = stack[split] + split -= 1 + stack_split[0] = split + if not module._pin_registered: + continue + size = module._pin.numel() * module._pin.element_size() + if torch.cuda.cudart().cudaHostUnregister(module._pin.data_ptr()) != 0: + comfy.model_management.discard_cuda_async_error() + continue + module._pin_registered = False + comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size) + freed += size + ram_to_unload -= size + if ram_to_unload <= 0: + return freed + return freed + def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]): freed = 0 pin_state = self.model.dynamic_pins[self.load_device] for subset in subsets: - hostbuf, stack = pin_state[subset] + hostbuf, stack, stack_split = pin_state[subset] while len(stack) > 0: module, offset = stack.pop() size = module._pin.numel() * module._pin.element_size() del module._pin - hostbuf.truncate(offset) - comfy.model_management.TOTAL_PINNED_MEMORY -= size - if comfy.model_management.TOTAL_PINNED_MEMORY < 0: - comfy.model_management.TOTAL_PINNED_MEMORY = 0 + hostbuf.truncate(offset, do_unregister=module._pin_registered) + stack_split[0] = min(stack_split[0], len(stack) - 1) + comfy.model_management.TOTAL_MODEL_MEMORY = max(0, comfy.model_management.TOTAL_MODEL_MEMORY - size) + if module._pin_registered: + comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size) freed += size ram_to_unload -= size if ram_to_unload <= 0: diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 35cbbcd9e215..8fe69916f988 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -2,6 +2,7 @@ import comfy.memory_management import 
comfy_aimdo.host_buffer import comfy_aimdo.torch +import torch from comfy.cli_args import args @@ -10,15 +11,33 @@ def get_pin(module, subset="weights"): def pin_memory(module, subset="weights", size=None): pin_state = module._pin_state - if pin_state["failed"] or args.disable_pinned_memory or get_pin(module, subset) is not None: + if pin_state["failed"] or args.disable_pinned_memory: return - hostbuf, stack = pin_state[subset] + hostbuf, stack, stack_split = pin_state[subset] + pin = get_pin(module, subset) + if pin is not None: + if module._pin_registered: + return + + size = module._pin.nbytes + comfy.model_management.ensure_pin_registerable(size) + + if torch.cuda.cudart().cudaHostRegister(module._pin.data_ptr(), size, 1) != 0: + comfy.model_management.discard_cuda_async_error() + return False + module._pin_registered = True + stack_split[0] = max(stack_split[0], module._pin_stack_index) + comfy.model_management.TOTAL_PINNED_MEMORY += size + return True + if size is None: size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) comfy.model_management.ensure_pin_budget(size) + comfy.model_management.ensure_pin_registerable(size) try: hostbuf.extend(size=size) @@ -29,5 +48,9 @@ def pin_memory(module, subset="weights", size=None): module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] module._pin.untyped_storage()._comfy_hostbuf = hostbuf stack.append((module, offset)) + module._pin_registered = True + module._pin_stack_index = len(stack) - 1 + stack_split[0] = max(stack_split[0], module._pin_stack_index) + comfy.model_management.TOTAL_MODEL_MEMORY += size comfy.model_management.TOTAL_PINNED_MEMORY += size return True From 18a74cb96ab6137f67229ffc0aa7e0f11a1e5ff3 Mon Sep 17 00:00:00 2001 From: Rattus Date: Wed, 13 May 2026 22:15:54 +1000 Subject: [PATCH 17/23] cli_args/execution: Implement lower background cache-ram threshold Limit the amount of RAM background intermediates can use, so that switching workflows doesn't degrade performance too much. --- comfy/cli_args.py | 7 ++++--- execution.py | 3 ++- main.py | 14 ++++++++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 9dadb0093bf0..e0d7d4af4b48 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -110,13 +110,11 @@ def from_string(cls, value: str): parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.") -CACHE_RAM_AUTO_GB = -1.0 - cache_group = parser.add_mutually_exclusive_group() cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.") cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.") cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.") -cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. 
Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).") +cache_group.add_argument("--cache-ram", nargs='*', type=float, default=None, metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).") attn_group = parser.add_mutually_exclusive_group() attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.") @@ -246,6 +244,9 @@ def is_valid_directory(path: str) -> str: else: args = parser.parse_args([]) +if args.cache_ram is not None and len(args.cache_ram) > 2: + parser.error("--cache-ram accepts at most two values: active GB and inactive GB") + if args.windows_standalone_build: args.auto_launch = True diff --git a/execution.py b/execution.py index 5605f09e7d1d..9c3968810631 100644 --- a/execution.py +++ b/execution.py @@ -728,6 +728,7 @@ async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs= self._notify_prompt_lifecycle("start", prompt_id) ram_headroom = int(self.cache_args["ram"] * (1024 ** 3)) + ram_inactive_headroom = int(self.cache_args["ram_inactive"] * (1024 ** 3)) ram_release_callback = self.caches.outputs.ram_release if self.cache_type == CacheType.RAM_PRESSURE else None comfy.memory_management.set_ram_cache_release_state(ram_release_callback, ram_headroom) @@ -781,7 +782,7 @@ async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs= execution_list.complete_node_execution() if self.cache_type == CacheType.RAM_PRESSURE: - ram_release_callback(ram_headroom) + ram_release_callback(ram_inactive_headroom) ram_shortfall = ram_headroom - psutil.virtual_memory().available comfy.model_management.free_pins(ram_shortfall) ram_release_callback(ram_headroom, free_active=True) diff --git a/main.py b/main.py index a6fdaf43c7db..ad9742252be2 100644 --- a/main.py +++ b/main.py @@ -283,19 +283,25 @@ def _collect_output_absolute_paths(history_result: dict) -> list[str]: def prompt_worker(q, server_instance): current_time: float = 0.0 - cache_ram = args.cache_ram - if cache_ram < 0: + cache_ram = 0 + cache_ram_inactive = 0 + if args.cache_ram is not None: cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0)) + cache_ram_inactive = min(96.0, max(12.0, comfy.model_management.total_ram * 0.75 / 1024.0)) + if len(args.cache_ram) > 0: + cache_ram = args.cache_ram[0] + if len(args.cache_ram) > 1: + cache_ram_inactive = args.cache_ram[1] cache_type = execution.CacheType.CLASSIC if args.cache_lru > 0: cache_type = execution.CacheType.LRU - elif cache_ram > 0: + elif max(cache_ram, cache_ram_inactive) > 0: cache_type = execution.CacheType.RAM_PRESSURE elif args.cache_none: cache_type = execution.CacheType.NONE - e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram } ) + e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram, "ram_inactive" : cache_ram_inactive } ) last_gc_collect = 0 need_gc = False gc_collect_interval = 10.0 From d8b442709a5607a3f82bcf5a03f2f81b1cdacadc Mon Sep 17 00:00:00 2001 From: Rattus Date: Wed, 13 May 2026 22:23:37 +1000 Subject: [PATCH 18/23] make default --- comfy/cli_args.py | 2 +- 
main.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index e0d7d4af4b48..d5d13008b67c 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -111,10 +111,10 @@ def from_string(cls, value: str): parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.") cache_group = parser.add_mutually_exclusive_group() +cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).") cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.") cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.") cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.") -cache_group.add_argument("--cache-ram", nargs='*', type=float, default=None, metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).") attn_group = parser.add_mutually_exclusive_group() attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. 
Ignored when xformers is used.") diff --git a/main.py b/main.py index ad9742252be2..1e47cab84cb7 100644 --- a/main.py +++ b/main.py @@ -285,7 +285,7 @@ def prompt_worker(q, server_instance): current_time: float = 0.0 cache_ram = 0 cache_ram_inactive = 0 - if args.cache_ram is not None: + if not args.cache_classic and not args.cache_none and args.cache_lru <= 0: cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0)) cache_ram_inactive = min(96.0, max(12.0, comfy.model_management.total_ram * 0.75 / 1024.0)) if len(args.cache_ram) > 0: @@ -293,11 +293,11 @@ def prompt_worker(q, server_instance): if len(args.cache_ram) > 1: cache_ram_inactive = args.cache_ram[1] - cache_type = execution.CacheType.CLASSIC - if args.cache_lru > 0: + cache_type = execution.CacheType.RAM_PRESSURE + if args.cache_classic: + cache_type = execution.CacheType.CLASSIC + elif args.cache_lru > 0: cache_type = execution.CacheType.LRU - elif max(cache_ram, cache_ram_inactive) > 0: - cache_type = execution.CacheType.RAM_PRESSURE elif args.cache_none: cache_type = execution.CacheType.NONE From 55197d8bfc1cddd484e61dd01bfd35af8e49fa97 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 15 May 2026 13:19:57 +1000 Subject: [PATCH 19/23] bump aimdo --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index eba0fc5ca757..6754c94c4226 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0 filelock av>=14.2.0 comfy-kitchen>=0.2.8 -comfy-aimdo==0.4.0 +comfy-aimdo==0.4.1 requests simpleeval>=1.0.0 blake3 From 0242954aaa24a5262cd59434a555deaf513129b9 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 15 May 2026 20:02:09 +1000 Subject: [PATCH 20/23] model-patcher: force-cast tiny weights Flux 2 gets severe stalls due to a mix of tiny and giant weights creating lopsided stream buffer rotations. --- comfy/model_patcher.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 7dc4d7801439..e1cd3283123e 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1670,6 +1670,9 @@ def setup_param(self, m, n, param_key): def force_load_param(self, param_key, device_to): key = key_param_name_to_key(n, param_key) + weight, _, _ = get_key_weight(self.model, key) + if weight is None: + return if key in self.backup: comfy.utils.set_attr_param(self.model, key, self.backup[key].weight) self.patch_weight_to_device(key, device_to=device_to, force_cast=True) @@ -1683,13 +1686,19 @@ def force_load_param(self, param_key, device_to): m._pin_state = pin_state set_dirty(m, dirty) - force_load, v_weight_size = setup_param(self, m, n, "weight") - force_load_bias, v_weight_bias = setup_param(self, m, n, "bias") - force_load = force_load or force_load_bias - v_weight_size += v_weight_bias + #Models that mix tiny and giant weights can cause lopsided stream buffer + #rotations and stall. Force the tiny ones over.
+ if module_mem > 16 * 1024: + force_load, v_weight_size = setup_param(self, m, n, "weight") + force_load_bias, v_weight_bias = setup_param(self, m, n, "bias") + force_load = force_load or force_load_bias + v_weight_size += v_weight_bias + if force_load: + logging.info(f"Module {n} has resizing Lora - force loading") + else: + force_load=True if force_load: - logging.info(f"Module {n} has resizing Lora - force loading") force_load_param(self, "weight", device_to) force_load_param(self, "bias", device_to) else: From ed15d62a6c6f7ed645e26b5917a242922df1c5a0 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 15 May 2026 22:26:17 +1000 Subject: [PATCH 21/23] ops: refactor in prep for chunking --- comfy/ops.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index be744a030b05..4b436f4a7510 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -190,25 +190,25 @@ def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): else: comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) - def handle_pin_miss(m, source, dest, subset="weights", size=None): - pin = None + def handle_pin(m, pin, source, dest, subset="weights", size=None): + if pin is not None: + cast_maybe_lowvram_patch([pin], dest, offload_stream) + return if signature is None: comfy.pinned_memory.pin_memory(m, subset=subset, size=size) pin = comfy.pinned_memory.get_pin(m, subset=subset) if pin is not None: cast_maybe_lowvram_patch(source, pin, None) - return [ pin ] + cast_maybe_lowvram_patch([ pin ], dest, offload_stream) + return if pin is None: pin_offset = get_stream_pin_buffer_offset(size) if pin_offset is not None: stream_pin_queue.append((source, pin_offset, size, dest)) - return None - return source - - if pin is None: - xfer_source = handle_pin_miss(s, xfer_source, xfer_dest, size=dest_size) + return + cast_maybe_lowvram_patch(source, dest, offload_stream) - cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) + handle_pin(s, pin, xfer_source, xfer_dest, size=dest_size) for param_key in ("weight", "bias"): lowvram_source = getattr(s, param_key + "_lowvram_function", None) @@ -219,9 +219,8 @@ def handle_pin_miss(m, source, dest, subset="weights", size=None): lowvram_source.prepare(lowvram_dest, None, copy=False, commit=True) pin = comfy.pinned_memory.get_pin(lowvram_source, subset="patches") - lowvram_source = handle_pin_miss(lowvram_source, lowvram_source, lowvram_dest, subset="patches", size=lowvram_size) if pin is None else [ pin ] + handle_pin(lowvram_source, pin, lowvram_source, lowvram_dest, subset="patches", size=lowvram_size) - cast_maybe_lowvram_patch(lowvram_source, lowvram_dest, offload_stream) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest From 43865639a74841dfaada96ca49485f570e231b46 Mon Sep 17 00:00:00 2001 From: Rattus Date: Sat, 16 May 2026 02:48:51 +1000 Subject: [PATCH 22/23] mm: delegate pin-on-the-way to aimdo Aimdo is able to chunk and slice this on the way for better CPU->GPU overlap. The main advantage is the ability to shorten the bus contention window between previous weight transfer and the next weights vbar fault. 
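For illustration, the overlap being chased here looks roughly like the plain-torch sketch below. This is a hypothetical double-buffered staging helper, not the aimdo implementation (aimdo does the chunking and slicing internally); names such as chunked_h2d and the 16 MB chunk size are assumptions for the example.

    import torch

    def chunked_h2d(src_cpu, dst_gpu, chunk_bytes=16 * 1024 * 1024):
        # Rotate two pinned staging buffers so that filling buffer b with the
        # next chunk (a CPU memcpy) overlaps with the async H2D copy of the
        # previous chunk still in flight on the side stream.
        stream = torch.cuda.Stream()
        src = src_cpu.view(-1).view(torch.uint8)   # assumes contiguous tensors
        dst = dst_gpu.view(-1).view(torch.uint8)
        bufs = [torch.empty(chunk_bytes, dtype=torch.uint8, pin_memory=True) for _ in range(2)]
        done = [torch.cuda.Event() for _ in range(2)]
        for i, start in enumerate(range(0, src.numel(), chunk_bytes)):
            n = min(chunk_bytes, src.numel() - start)
            b = i % 2
            done[b].synchronize()                      # wait until staging buffer b is free
            bufs[b][:n].copy_(src[start:start + n])    # CPU -> pinned staging
            with torch.cuda.stream(stream):
                dst[start:start + n].copy_(bufs[b][:n], non_blocking=True)  # pinned -> GPU
                done[b].record(stream)                 # buffer b reusable after this copy
        stream.synchronize()

Smaller chunks put the first bytes on the bus sooner, which is what shortens the contention window described above.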
--- comfy/memory_management.py | 15 ++++++++++++--- comfy/model_management.py | 8 ++++++-- comfy/ops.py | 14 ++++++++++---- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 7645064f59a6..21e3cf59b426 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -15,7 +15,7 @@ class TensorFileSlice(NamedTuple): size: int -def read_tensor_file_slice_into(tensor, destination): +def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=None): if isinstance(tensor, QuantizedTensor): if not isinstance(destination, QuantizedTensor): @@ -23,12 +23,17 @@ def read_tensor_file_slice_into(tensor, destination): if tensor._layout_cls != destination._layout_cls: return False - if not read_tensor_file_slice_into(tensor._qdata, destination._qdata): + if not read_tensor_file_slice_into(tensor._qdata, destination._qdata, stream=stream, + destination2=(destination2._qdata if destination2 is not None else None)): return False dst_orig_dtype = destination._params.orig_dtype destination._params.copy_from(tensor._params, non_blocking=False) destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype) + if destination2 is not None: + dst_orig_dtype = destination2._params.orig_dtype + destination2._params.copy_from(destination._params, non_blocking=True) + destination2._params = dataclasses.replace(destination2._params, orig_dtype=dst_orig_dtype) return True info = getattr(tensor.untyped_storage(), "_comfy_tensor_file_slice", None) @@ -50,8 +55,12 @@ def read_tensor_file_slice_into(tensor, destination): hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None) if hostbuf is not None: + stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0 + device_ptr = destination2.data_ptr() if destination2 is not None else 0 hostbuf.read_file_slice(file_obj, info.offset, info.size, - offset=destination.data_ptr() - hostbuf.get_raw_address()) + offset=destination.data_ptr() - hostbuf.get_raw_address(), + stream=stream_ptr, + device_ptr=device_ptr) return True buf_type = ctypes.c_ubyte * info.size diff --git a/comfy/model_management.py b/comfy/model_management.py index 19a9163620e6..72ef77ee997c 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1318,7 +1318,7 @@ def sync_stream(device, stream): current_stream(device).wait_stream(stream) -def cast_to_gathered(tensors, r, non_blocking=False, stream=None): +def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None): wf_context = nullcontext() if stream is not None: wf_context = stream @@ -1326,16 +1326,20 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None): wf_context = wf_context.as_context(stream) dest_views = comfy.memory_management.interpret_gathered_like(tensors, r) + dest2_views = comfy.memory_management.interpret_gathered_like(tensors, r2) if r2 is not None else None with wf_context: for tensor in tensors: dest_view = dest_views.pop(0) + dest2_view = dest2_views.pop(0) if dest2_views is not None else None if tensor is None: continue - if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view): + if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view, stream=stream, destination2=dest2_view): continue storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() mark_mmap_dirty(storage) dest_view.copy_(tensor, non_blocking=non_blocking) + if dest2_view 
is not None: + dest2_view.copy_(dest_view, non_blocking=non_blocking) def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None, r=None): diff --git a/comfy/ops.py b/comfy/ops.py index 4b436f4a7510..2f364a3a54a6 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -198,8 +198,11 @@ def handle_pin(m, pin, source, dest, subset="weights", size=None): comfy.pinned_memory.pin_memory(m, subset=subset, size=size) pin = comfy.pinned_memory.get_pin(m, subset=subset) if pin is not None: - cast_maybe_lowvram_patch(source, pin, None) - cast_maybe_lowvram_patch([ pin ], dest, offload_stream) + if isinstance(source, list): + comfy.model_management.cast_to_gathered(source, pin, non_blocking=non_blocking, stream=offload_stream, r2=dest) + else: + cast_maybe_lowvram_patch(source, pin, None) + cast_maybe_lowvram_patch([ pin ], dest, offload_stream) return if pin is None: pin_offset = get_stream_pin_buffer_offset(size) @@ -238,8 +241,11 @@ def handle_pin(m, pin, source, dest, subset="weights", size=None): stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] - cast_maybe_lowvram_patch(xfer_source, pin, None) - comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) + if isinstance(xfer_source, list): + comfy.model_management.cast_to_gathered(xfer_source, pin, non_blocking=non_blocking, stream=offload_stream, r2=xfer_dest) + else: + cast_maybe_lowvram_patch(xfer_source, pin, None) + comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) stream_pin_hostbuf._comfy_event = offload_stream.record_event() return offload_stream From 52a68b9b1c11b177ef62dce9eef8cdc1ac9d1db4 Mon Sep 17 00:00:00 2001 From: Rattus Date: Sat, 16 May 2026 02:56:24 +1000 Subject: [PATCH 23/23] bump aimdo --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6754c94c4226..193d60cf04d9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0 filelock av>=14.2.0 comfy-kitchen>=0.2.8 -comfy-aimdo==0.4.1 +comfy-aimdo==0.4.2 requests simpleeval>=1.0.0 blake3
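For reference, the RAM pressure caching thresholds introduced in patches 17 and 18 work out as sketched below. This is illustrative only: it assumes a 64 GB machine and expresses everything in GB, whereas main.py derives the values from comfy.model_management.total_ram in prompt_worker().

    total_ram_gb = 64.0  # assumed example machine
    cache_ram = min(32.0, max(4.0, total_ram_gb * 0.25))            # 16.0 GB active headroom
    cache_ram_inactive = min(96.0, max(12.0, total_ram_gb * 0.75))  # 48.0 GB inactive/pin headroom

On the command line the same thresholds can be set explicitly, e.g. --cache-ram 8 24 for 8 GB active and 24 GB inactive headroom, while --cache-ram with no values (or omitting the flag entirely, now that RAM pressure caching is the default) uses the auto values above; --cache-classic, --cache-lru and --cache-none still opt out.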