From 1fe3a13f8476f333cb825e0b4a7f436a27684f36 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 21:08:15 +1000 Subject: [PATCH 01/23] model_management: disable non-dynamic smart memory Disable smart memory outright for non-dynamic models. This is a minor step towards deprecation of --disable-dynamic-vram and the legacy ModelPatcher. This is needed for estimate-free model development, where new models can opt out of supplying a memory estimate and not have to worry about hard VRAM allocations due to legacy non-dynamic model patchers. This is also a general stability increase for a lot of stray use cases where estimates may still be off, and going forward we are not going to maintain such estimates accurately. --- comfy/model_management.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 21738a4c7816..ebef03ceb62a 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -674,10 +674,10 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins i = x[-1] memory_to_free = 1e32 pins_to_free = 1e32 - if not DISABLE_SMART_MEMORY or device is None: + if current_loaded_models[i].model.is_dynamic() and (not DISABLE_SMART_MEMORY or device is None): memory_to_free = 0 if device is None else memory_required - get_free_memory(device) pins_to_free = pins_required - get_free_ram() - if current_loaded_models[i].model.is_dynamic() and for_dynamic: + if for_dynamic: #don't actually unload dynamic models for the sake of other dynamic models #as that works on-demand. memory_required -= current_loaded_models[i].model.loaded_size() From 157965a1c99792e6250e6027ba2045efdd148528 Mon Sep 17 00:00:00 2001 From: Rattus Date: Mon, 4 May 2026 12:32:12 +1000 Subject: [PATCH 02/23] pinned_memory: implement with aimdo growable buffer Use a single growable buffer so we can do threaded pre-warming on pinned memory.
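Illustrative sketch, not part of the patch: the growable-pin-buffer pattern patch 02 introduces, assuming only the comfy_aimdo HostBuffer calls visible in the diff below (HostBuffer(size), extend(), truncate(), size) and comfy_aimdo.torch.hostbuf_to_tensor(). Function names here are hypothetical.

    import comfy_aimdo.host_buffer
    import comfy_aimdo.torch

    # One growable pinned buffer per (model, device); modules borrow slices of it.
    pin_state = {"hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), "stack": [], "failed": False}

    def pin_slice(module, size):
        hostbuf = pin_state["hostbuf"]
        offset = hostbuf.size              # append the new pin at the current end
        try:
            hostbuf.extend(size=size)      # grow the single pinned allocation in place
        except RuntimeError:
            pin_state["failed"] = True     # no more pinnable RAM; fall back to pageable copies
            return None
        # View the whole buffer as a tensor and hand back just this module's region.
        pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
        pin_state["stack"].append((module, offset))
        return pin

    def unpin_newest():
        # LIFO unpin: popping the most recent slice lets the buffer shrink by truncation.
        module, offset = pin_state["stack"].pop()
        pin_state["hostbuf"].truncate(offset)

Keeping every pin inside one registration is what the later patches lean on: growth happens in large steps instead of one page-lock call per module.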
--- comfy/model_patcher.py | 34 +++++++++++++++++++++------------- comfy/pinned_memory.py | 30 ++++++++++-------------------- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 2ea14bc2c8a4..dc5f0e577ec9 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -35,6 +35,7 @@ import comfy.ops import comfy.patcher_extension import comfy.utils +import comfy_aimdo.host_buffer from comfy.comfy_types import UnetWrapperFunction from comfy.quant_ops import QuantizedTensor from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP @@ -1543,6 +1544,10 @@ def __init__(self, model, load_device, offload_device, size=0, weight_inplace_up super().__init__(model, load_device, offload_device, size, weight_inplace_update) if not hasattr(self.model, "dynamic_vbars"): self.model.dynamic_vbars = {} + if not hasattr(self.model, "dynamic_pins"): + self.model.dynamic_pins = {} + if self.load_device not in self.model.dynamic_pins: + self.model.dynamic_pins[self.load_device] = {"hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), "stack": [], "failed": False} self.non_dynamic_delegate_model = None assert load_device is not None @@ -1604,6 +1609,8 @@ def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False self.unpatch_hooks() vbar = self._vbar_get(create=True) + pin_state = self.model.dynamic_pins[self.load_device] + pin_state["failed"] = False if vbar is not None: vbar.prioritize() @@ -1655,8 +1662,8 @@ def force_load_param(self, param_key, device_to): if hasattr(m, "comfy_cast_weights"): m.comfy_cast_weights = True - m.pin_failed = False m.seed_key = n + m._pin_state = pin_state set_dirty(m, dirty) force_load, v_weight_size = setup_param(self, m, n, "weight") @@ -1734,20 +1741,21 @@ def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=Fals return freed def pinned_memory_size(self): - total = 0 - loading = self._load_list(for_dynamic=True) - for x in loading: - _, _, _, _, m, _ = x - pin = comfy.pinned_memory.get_pin(m) - if pin is not None: - total += pin.numel() * pin.element_size() - return total + return self.model.dynamic_pins[self.load_device]["hostbuf"].size def partially_unload_ram(self, ram_to_unload): - loading = self._load_list(for_dynamic=True, default_device=self.offload_device) - for x in loading: - *_, m, _ = x - ram_to_unload -= comfy.pinned_memory.unpin_memory(m) + pin_state = self.model.dynamic_pins[self.load_device] + hostbuf = pin_state["hostbuf"] + stack = self.model.dynamic_pins[self.load_device]["stack"] + while len(stack) > 0: + module, offset = stack.pop() + size = module._pin.numel() * module._pin.element_size() + del module._pin + hostbuf.truncate(offset) + comfy.model_management.TOTAL_PINNED_MEMORY -= size + if comfy.model_management.TOTAL_PINNED_MEMORY < 0: + comfy.model_management.TOTAL_PINNED_MEMORY = 0 + ram_to_unload -= size if ram_to_unload <= 0: return diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 6d3ba367a798..3638066c8825 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -9,35 +9,25 @@ def get_pin(module): return getattr(module, "_pin", None) def pin_memory(module): - if module.pin_failed or args.disable_pinned_memory or get_pin(module) is not None: + pin_state = module._pin_state + if pin_state["failed"] or args.disable_pinned_memory or get_pin(module) is not None: return + hostbuf = pin_state["hostbuf"] size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) - + offset = 
hostbuf.size if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY: - module.pin_failed = True + pin_state["failed"] = True return False try: - hostbuf = comfy_aimdo.host_buffer.HostBuffer(size) + hostbuf.extend(size=size) except RuntimeError: - module.pin_failed = True + pin_state["failed"] = True return False - module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf) - module._pin_hostbuf = hostbuf + module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] + module._pin.untyped_storage()._comfy_hostbuf = hostbuf + pin_state["stack"].append((module, offset)) comfy.model_management.TOTAL_PINNED_MEMORY += size return True - -def unpin_memory(module): - if get_pin(module) is None: - return 0 - size = module._pin.numel() * module._pin.element_size() - - comfy.model_management.TOTAL_PINNED_MEMORY -= size - if comfy.model_management.TOTAL_PINNED_MEMORY < 0: - comfy.model_management.TOTAL_PINNED_MEMORY = 0 - - del module._pin - del module._pin_hostbuf - return size From b66b6420681a83f5bd247dec42d95f96113503d7 Mon Sep 17 00:00:00 2001 From: Rattus Date: Mon, 4 May 2026 12:47:28 +1000 Subject: [PATCH 03/23] mm: use aimdo to do transfer from disk to pin Aimdo implements a faster threaded loader. --- comfy/memory_management.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 48e3c11da9b6..4a628b05c18f 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -48,6 +48,12 @@ def read_tensor_file_slice_into(tensor, destination): if info.size == 0: return True + hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None) + if hostbuf is not None: + hostbuf.read_file_slice(file_obj, info.offset, info.size, + offset=destination.data_ptr() - hostbuf.get_raw_address()) + return True + buf_type = ctypes.c_ubyte * info.size view = memoryview(buf_type.from_address(destination.data_ptr())) From 8070cb77809145e7cf24b94eeb7f55710cdfcd17 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 14:04:48 +1000 Subject: [PATCH 04/23] Add stream host pin buffer for AIMDO casts Introduce per-offload-stream HostBuffer reuse for pinned staging, include it in cast buffer reset synchronization. Defer actual casts that go via this pin path to a separate pass such that the buffer can be allocated monolithically (to avoid cudaHostRegister thrash). 
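Sketch only, not part of the series: the disk-to-pin fast path patch 03 adds, pulled out on its own. It assumes the HostBuffer.read_file_slice()/get_raw_address() calls shown in that diff; file_obj, file_offset and size are stand-ins for the TensorFileSlice bookkeeping that read_tensor_file_slice_into() normally supplies.

    def read_into_pinned(destination, file_obj, file_offset, size):
        # If the destination tensor is backed by an aimdo HostBuffer, let aimdo's
        # threaded reader copy the file slice straight into the pinned region.
        hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None)
        if hostbuf is None:
            return False  # not pinned through a HostBuffer; caller falls back to the slow path
        hostbuf.read_file_slice(file_obj, file_offset, size,
                                offset=destination.data_ptr() - hostbuf.get_raw_address())
        return True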
--- comfy/model_management.py | 18 ++++++++++-- comfy/ops.py | 58 +++++++++++++++++++++++++++++++-------- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index ebef03ceb62a..facdd0873d7a 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -31,6 +31,7 @@ import comfy.memory_management import comfy.utils import comfy.quant_ops +import comfy_aimdo.host_buffer import comfy_aimdo.vram_buffer class VRAMState(Enum): @@ -1180,8 +1181,10 @@ def current_stream(device): LARGEST_CASTED_WEIGHT = (None, 0) STREAM_AIMDO_CAST_BUFFERS = {} LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) +STREAM_PIN_BUFFERS = {} DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3 +DEFAULT_PIN_BUFFER_PRIME_SIZE = 1024 ** 2 def get_cast_buffer(offload_stream, device, size, ref): global LARGEST_CASTED_WEIGHT @@ -1220,21 +1223,32 @@ def get_aimdo_cast_buffer(offload_stream, device): if cast_buffer is None: cast_buffer = comfy_aimdo.vram_buffer.VRAMBuffer(DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE, device.index) STREAM_AIMDO_CAST_BUFFERS[offload_stream] = cast_buffer - return cast_buffer + +def get_pin_buffer(offload_stream): + pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None) + if pin_buffer is None: + # A small non-zero default primes HostBuffer's larger virtual reservation. + pin_buffer = comfy_aimdo.host_buffer.HostBuffer(DEFAULT_PIN_BUFFER_PRIME_SIZE) + STREAM_PIN_BUFFERS[offload_stream] = pin_buffer + elif offload_stream is not None: + offload_stream.synchronize() + return pin_buffer + def reset_cast_buffers(): global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT LARGEST_CASTED_WEIGHT = (None, 0) LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) - for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS): + for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS) | set(STREAM_PIN_BUFFERS): if offload_stream is not None: offload_stream.synchronize() synchronize() STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() + STREAM_PIN_BUFFERS.clear() soft_empty_cache() def get_offload_stream(device): diff --git a/comfy/ops.py b/comfy/ops.py index 77ad1d5276da..3d196f43877a 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -75,6 +75,8 @@ def scaled_dot_product_attention(q, k, v, *args, **kwargs): cast_to = comfy.model_management.cast_to #TODO: remove once no more references +STREAM_PIN_BUFFER_HEADROOM = 8 * 1024 * 1024 + def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) @@ -91,6 +93,9 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin offload_stream = None cast_buffer = None cast_buffer_offset = 0 + stream_pin_hostbuf = None + stream_pin_offset = 0 + stream_pin_queue = [] def ensure_offload_stream(module, required_size, check_largest): nonlocal offload_stream @@ -124,6 +129,20 @@ def get_cast_buffer(buffer_size): cast_buffer_offset += buffer_size return buffer + def get_stream_pin_buffer_offset(buffer_size): + nonlocal stream_pin_hostbuf + nonlocal stream_pin_offset + + if buffer_size == 0 or offload_stream is None: + return None + + if stream_pin_hostbuf is None: + stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream) + + offset = stream_pin_offset + stream_pin_offset += buffer_size + return offset + for s in comfy_modules: signature = comfy_aimdo.model_vbar.vbar_fault(s._v) resident = 
comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature) @@ -162,17 +181,21 @@ def get_cast_buffer(buffer_size): if xfer_dest is None: xfer_dest = get_cast_buffer(dest_size) - if signature is None and pin is None: - comfy.pinned_memory.pin_memory(s) - pin = comfy.pinned_memory.get_pin(s) - else: - pin = None - - if pin is not None: - comfy.model_management.cast_to_gathered(xfer_source, pin) - xfer_source = [ pin ] - #send it over - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + if pin is None: + if signature is None: + comfy.pinned_memory.pin_memory(s) + pin = comfy.pinned_memory.get_pin(s) + if pin is not None: + comfy.model_management.cast_to_gathered(xfer_source, pin) + xfer_source = [ pin ] + if pin is None: + pin_offset = get_stream_pin_buffer_offset(dest_size) + if pin_offset is not None: + stream_pin_queue.append((xfer_source, pin_offset, dest_size, xfer_dest)) + xfer_source = None + + if xfer_source is not None: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) for param_key in ("weight", "bias"): lowvram_fn = getattr(s, param_key + "_lowvram_function", None) @@ -186,6 +209,19 @@ def get_cast_buffer(buffer_size): prefetch["needs_cast"] = needs_cast s._prefetch = prefetch + if stream_pin_offset > 0: + stream_pin_hostbuf_size = getattr(stream_pin_hostbuf, "_comfy_stream_pin_size", stream_pin_hostbuf.size) + if stream_pin_hostbuf_size < stream_pin_offset: + stream_pin_hostbuf_size = stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM + stream_pin_hostbuf.extend(size=stream_pin_hostbuf_size, reallocate=True) + stream_pin_hostbuf._comfy_stream_pin_size = stream_pin_hostbuf_size + stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) + stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf + for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: + pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] + comfy.model_management.cast_to_gathered(xfer_source, pin) + comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) + return offload_stream From 17955235b2d95e9e8aa6f9719bcc1a29d8976ceb Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 18:33:02 +1000 Subject: [PATCH 05/23] remove old pin path --- comfy/model_management.py | 74 +++++++-------------------------------- comfy/model_patcher.py | 3 -- comfy/utils.py | 2 -- comfy/windows.py | 52 --------------------------- 4 files changed, 13 insertions(+), 118 deletions(-) delete mode 100644 comfy/windows.py diff --git a/comfy/model_management.py b/comfy/model_management.py index facdd0873d7a..4b96d1492e4b 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -496,6 +496,8 @@ def get_torch_device_name(device): current_loaded_models = [] +DIRTY_MMAPS = set() + def module_size(module): module_mem = 0 sd = module.state_dict() @@ -504,27 +506,11 @@ def module_size(module): module_mem += t.nbytes return module_mem -def module_mmap_residency(module, free=False): - mmap_touched_mem = 0 - module_mem = 0 - bounced_mmaps = set() - sd = module.state_dict() - for k in sd: - t = sd[k] - module_mem += t.nbytes - storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage() - if not getattr(storage, "_comfy_tensor_mmap_touched", False): - continue - mmap_touched_mem += t.nbytes - if not free: - continue - 
storage._comfy_tensor_mmap_touched = False - mmap_obj = storage._comfy_tensor_mmap_refs[0] - if mmap_obj in bounced_mmaps: - continue - mmap_obj.bounce() - bounced_mmaps.add(mmap_obj) - return mmap_touched_mem, module_mem +def mark_mmap_dirty(storage): + mmap_refs = getattr(storage, "_comfy_tensor_mmap_refs", None) + if mmap_refs is not None: + DIRTY_MMAPS.add(mmap_refs[0]) + class LoadedModel: def __init__(self, model): @@ -554,9 +540,6 @@ def model(self): def model_memory(self): return self.model.model_size() - def model_mmap_residency(self, free=False): - return self.model.model_mmap_residency(free=free) - def model_loaded_memory(self): return self.model.loaded_size() @@ -636,15 +619,9 @@ def offloaded_memory(loaded_models, device): EXTRA_RESERVED_VRAM = 400 * 1024 * 1024 if WINDOWS: - import comfy.windows EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue if total_vram > (15 * 1024): # more extra reserved vram on 16GB+ cards EXTRA_RESERVED_VRAM += 100 * 1024 * 1024 - def get_free_ram(): - return comfy.windows.get_free_ram() -else: - def get_free_ram(): - return psutil.virtual_memory().available if args.reserve_vram is not None: EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024 @@ -658,7 +635,6 @@ def minimum_inference_memory(): def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins_required=0, ram_required=0): cleanup_models_gc() - comfy.memory_management.extra_ram_release(max(pins_required, ram_required)) unloaded_model = [] can_unload = [] unloaded_models = [] @@ -674,10 +650,8 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins for x in can_unload_sorted: i = x[-1] memory_to_free = 1e32 - pins_to_free = 1e32 if current_loaded_models[i].model.is_dynamic() and (not DISABLE_SMART_MEMORY or device is None): memory_to_free = 0 if device is None else memory_required - get_free_memory(device) - pins_to_free = pins_required - get_free_ram() if for_dynamic: #don't actually unload dynamic models for the sake of other dynamic models #as that works on-demand. 
@@ -686,18 +660,6 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins if memory_to_free > 0 and current_loaded_models[i].model_unload(memory_to_free): logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}") unloaded_model.append(i) - if pins_to_free > 0: - logging.debug(f"PIN Unloading {current_loaded_models[i].model.model.__class__.__name__}") - current_loaded_models[i].model.partially_unload_ram(pins_to_free) - - for x in can_unload_sorted: - i = x[-1] - ram_to_free = ram_required - psutil.virtual_memory().available - if ram_to_free <= 0 and i not in unloaded_model: - continue - resident_memory, _ = current_loaded_models[i].model_mmap_residency(free=True) - if resident_memory > 0: - logging.debug(f"RAM Unloading {current_loaded_models[i].model.model.__class__.__name__}") for i in sorted(unloaded_model, reverse=True): unloaded_models.append(current_loaded_models.pop(i)) @@ -763,29 +725,16 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu model_to_unload.model.detach(unpatch_all=False) model_to_unload.model_finalizer.detach() - total_memory_required = {} - total_pins_required = {} - total_ram_required = {} for loaded_model in models_to_load: device = loaded_model.device total_memory_required[device] = total_memory_required.get(device, 0) + loaded_model.model_memory_required(device) - resident_memory, model_memory = loaded_model.model_mmap_residency() - pinned_memory = loaded_model.model.pinned_memory_size() - #FIXME: This can over-free the pins as it budgets to pin the entire model. We should - #make this JIT to keep as much pinned as possible. - pins_required = model_memory - pinned_memory - ram_required = model_memory - resident_memory - total_pins_required[device] = total_pins_required.get(device, 0) + pins_required - total_ram_required[device] = total_ram_required.get(device, 0) + ram_required for device in total_memory_required: if device != torch.device("cpu"): free_memory(total_memory_required[device] * 1.1 + extra_mem, device, - for_dynamic=free_for_dynamic, - pins_required=total_pins_required[device], - ram_required=total_ram_required[device]) + for_dynamic=free_for_dynamic) for device in total_memory_required: if device != torch.device("cpu"): @@ -1246,6 +1195,10 @@ def reset_cast_buffers(): offload_stream.synchronize() synchronize() + for mmap_obj in DIRTY_MMAPS: + mmap_obj.bounce() + + DIRTY_MMAPS.clear() STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() STREAM_PIN_BUFFERS.clear() @@ -1310,8 +1263,7 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None): if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view): continue storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() - if hasattr(storage, "_comfy_tensor_mmap_touched"): - storage._comfy_tensor_mmap_touched = True + mark_mmap_dirty(storage) dest_view.copy_(tensor, non_blocking=non_blocking) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index dc5f0e577ec9..43712c7a0359 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -342,9 +342,6 @@ def model_size(self): self.size = comfy.model_management.module_size(self.model) return self.size - def model_mmap_residency(self, free=False): - return comfy.model_management.module_mmap_residency(self.model, free=free) - def loaded_size(self): return self.model.model_loaded_weight_memory diff --git a/comfy/utils.py b/comfy/utils.py index 
b759720274f1..fabe18b510cd 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -113,7 +113,6 @@ def load_safetensors(ckpt): "_comfy_tensor_file_slice", comfy.memory_management.TensorFileSlice(f, threading.get_ident(), data_base_offset + start, end - start)) setattr(storage, "_comfy_tensor_mmap_refs", (model_mmap, mv)) - setattr(storage, "_comfy_tensor_mmap_touched", False) sd[name] = tensor return sd, header.get("__metadata__", {}), @@ -1445,4 +1444,3 @@ def deepcopy_list_dict(obj, memo=None): memo[obj_id] = res return res - diff --git a/comfy/windows.py b/comfy/windows.py deleted file mode 100644 index 213dc481d937..000000000000 --- a/comfy/windows.py +++ /dev/null @@ -1,52 +0,0 @@ -import ctypes -import logging -import psutil -from ctypes import wintypes - -import comfy_aimdo.control - -psapi = ctypes.WinDLL("psapi") -kernel32 = ctypes.WinDLL("kernel32") - -class PERFORMANCE_INFORMATION(ctypes.Structure): - _fields_ = [ - ("cb", wintypes.DWORD), - ("CommitTotal", ctypes.c_size_t), - ("CommitLimit", ctypes.c_size_t), - ("CommitPeak", ctypes.c_size_t), - ("PhysicalTotal", ctypes.c_size_t), - ("PhysicalAvailable", ctypes.c_size_t), - ("SystemCache", ctypes.c_size_t), - ("KernelTotal", ctypes.c_size_t), - ("KernelPaged", ctypes.c_size_t), - ("KernelNonpaged", ctypes.c_size_t), - ("PageSize", ctypes.c_size_t), - ("HandleCount", wintypes.DWORD), - ("ProcessCount", wintypes.DWORD), - ("ThreadCount", wintypes.DWORD), - ] - -def get_free_ram(): - #Windows is way too conservative and chalks recently used uncommitted model RAM - #as "in-use". So, calculate free RAM for the sake of general use as the greater of: - # - #1: What psutil says - #2: Total Memory - (Committed Memory - VRAM in use) - # - #We have to subtract VRAM in use from the comitted memory as WDDM creates a naked - #commit charge for all VRAM used just incase it wants to page it all out. This just - #isn't realistic so "overcommit" on our calculations by just subtracting it off. - - pi = PERFORMANCE_INFORMATION() - pi.cb = ctypes.sizeof(pi) - - if not psapi.GetPerformanceInfo(ctypes.byref(pi), pi.cb): - logging.warning("WARNING: Failed to query windows performance info. RAM usage may be sub optimal") - return psutil.virtual_memory().available - - committed = pi.CommitTotal * pi.PageSize - total = pi.PhysicalTotal * pi.PageSize - - return max(psutil.virtual_memory().available, - total - (committed - comfy_aimdo.control.get_total_vram_usage())) - From 8187cd783e20ac71cbf88d51338d632996838cb3 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 19:55:41 +1000 Subject: [PATCH 06/23] Implement JIT pinned memory pressure Replace the predictive pin pressure mechanism with JIT PIN memory pressure. 
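Sketch only: the JIT pressure idea described above, shown standalone. is_dynamic(), dynamic_pins and partially_unload_ram() mirror names from the diff below; max_pinned/total_pinned stand in for MAX_PINNED_MEMORY/TOTAL_PINNED_MEMORY, and the hysteresis keeps one shortfall from re-triggering eviction on every subsequent pin.

    PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024

    def ensure_pin_budget_sketch(size, loaded_models, max_pinned, total_pinned, evict_active=False):
        if max_pinned <= 0:
            return
        shortfall = total_pinned + size - max_pinned
        if shortfall <= 0:
            return  # the new pin still fits; nothing to evict
        # Free a little more than strictly needed so small follow-up pins don't
        # immediately force another round of eviction.
        shortfall += PIN_PRESSURE_HYSTERESIS
        # Walk models from least to most recently used, dropping their pins on demand.
        for loaded_model in reversed(loaded_models):
            model = loaded_model.model
            if model is None or not model.is_dynamic():
                continue
            if not evict_active and model.dynamic_pins[model.load_device]["active"]:
                continue  # skip the model that is actively loading its pins
            shortfall -= model.partially_unload_ram(shortfall)
            if shortfall <= 0:
                break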
--- comfy/model_management.py | 52 ++++++++++++++++++++++++++++++++++----- comfy/model_patcher.py | 17 ++++++++++--- comfy/ops.py | 8 ++++-- comfy/pinned_memory.py | 4 +-- 4 files changed, 66 insertions(+), 15 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 4b96d1492e4b..6a2126cb59bb 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -498,6 +498,8 @@ def get_torch_device_name(device): DIRTY_MMAPS = set() +PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024 + def module_size(module): module_mem = 0 sd = module.state_dict() @@ -511,6 +513,21 @@ def mark_mmap_dirty(storage): if mmap_refs is not None: DIRTY_MMAPS.add(mmap_refs[0]) +def ensure_pin_budget(size, evict_active=False): + if MAX_PINNED_MEMORY <= 0: + return + + shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + if shortfall <= 0: + return + + shortfall += PIN_PRESSURE_HYSTERESIS + for loaded_model in reversed(current_loaded_models): + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.dynamic_pins[model.load_device]["active"]): + shortfall -= model.partially_unload_ram(shortfall) + if shortfall <= 0: + break class LoadedModel: def __init__(self, model): @@ -1133,7 +1150,6 @@ def current_stream(device): STREAM_PIN_BUFFERS = {} DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3 -DEFAULT_PIN_BUFFER_PRIME_SIZE = 1024 ** 2 def get_cast_buffer(offload_stream, device, size, ref): global LARGEST_CASTED_WEIGHT @@ -1177,14 +1193,29 @@ def get_aimdo_cast_buffer(offload_stream, device): def get_pin_buffer(offload_stream): pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None) if pin_buffer is None: - # A small non-zero default primes HostBuffer's larger virtual reservation. - pin_buffer = comfy_aimdo.host_buffer.HostBuffer(DEFAULT_PIN_BUFFER_PRIME_SIZE) + pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0) STREAM_PIN_BUFFERS[offload_stream] = pin_buffer elif offload_stream is not None: offload_stream.synchronize() return pin_buffer +def resize_pin_buffer(pin_buffer, size): + global TOTAL_PINNED_MEMORY + old_size = getattr(pin_buffer, "_comfy_stream_pin_size", 0) + if size <= old_size: + return True + growth = size - old_size + ensure_pin_budget(growth, evict_active=True) + try: + pin_buffer.extend(size=size, reallocate=True) + except RuntimeError: + return False + pin_buffer._comfy_stream_pin_size = size + TOTAL_PINNED_MEMORY += growth + return True + def reset_cast_buffers(): + global TOTAL_PINNED_MEMORY global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT @@ -1197,8 +1228,18 @@ def reset_cast_buffers(): for mmap_obj in DIRTY_MMAPS: mmap_obj.bounce() - DIRTY_MMAPS.clear() + + for pin_buffer in STREAM_PIN_BUFFERS.values(): + TOTAL_PINNED_MEMORY -= getattr(pin_buffer, "_comfy_stream_pin_size", 0) + if TOTAL_PINNED_MEMORY < 0: + TOTAL_PINNED_MEMORY = 0 + + for loaded_model in current_loaded_models: + model = loaded_model.model + if model is not None and model.is_dynamic(): + model.dynamic_pins[model.load_device]["active"] = False + STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() STREAM_PIN_BUFFERS.clear() @@ -1344,8 +1385,7 @@ def pin_memory(tensor): return False size = tensor.nbytes - if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY: - return False + ensure_pin_budget(size) ptr = tensor.data_ptr() if ptr == 0: diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 43712c7a0359..def0901dcd4c 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1117,7 +1117,7 @@ def 
pinned_memory_size(self): return 0 def partially_unload_ram(self, ram_to_unload): - pass + return 0 def detach(self, unpatch_all=True): self.eject_model() @@ -1544,7 +1544,12 @@ def __init__(self, model, load_device, offload_device, size=0, weight_inplace_up if not hasattr(self.model, "dynamic_pins"): self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: - self.model.dynamic_pins[self.load_device] = {"hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), "stack": [], "failed": False} + self.model.dynamic_pins[self.load_device] = { + "hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), + "stack": [], + "failed": False, + "active": False, + } self.non_dynamic_delegate_model = None assert load_device is not None @@ -1608,6 +1613,7 @@ def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False vbar = self._vbar_get(create=True) pin_state = self.model.dynamic_pins[self.load_device] pin_state["failed"] = False + pin_state["active"] = True if vbar is not None: vbar.prioritize() @@ -1741,9 +1747,10 @@ def pinned_memory_size(self): return self.model.dynamic_pins[self.load_device]["hostbuf"].size def partially_unload_ram(self, ram_to_unload): + freed = 0 pin_state = self.model.dynamic_pins[self.load_device] hostbuf = pin_state["hostbuf"] - stack = self.model.dynamic_pins[self.load_device]["stack"] + stack = pin_state["stack"] while len(stack) > 0: module, offset = stack.pop() size = module._pin.numel() * module._pin.element_size() @@ -1752,9 +1759,11 @@ def partially_unload_ram(self, ram_to_unload): comfy.model_management.TOTAL_PINNED_MEMORY -= size if comfy.model_management.TOTAL_PINNED_MEMORY < 0: comfy.model_management.TOTAL_PINNED_MEMORY = 0 + freed += size ram_to_unload -= size if ram_to_unload <= 0: - return + return freed + return freed def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False): #This isn't used by the core at all and can only be to load a model out of diff --git a/comfy/ops.py b/comfy/ops.py index 3d196f43877a..ee3184894305 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -138,6 +138,8 @@ def get_stream_pin_buffer_offset(buffer_size): if stream_pin_hostbuf is None: stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream) + if stream_pin_hostbuf is None: + return None offset = stream_pin_offset stream_pin_offset += buffer_size @@ -213,8 +215,10 @@ def get_stream_pin_buffer_offset(buffer_size): stream_pin_hostbuf_size = getattr(stream_pin_hostbuf, "_comfy_stream_pin_size", stream_pin_hostbuf.size) if stream_pin_hostbuf_size < stream_pin_offset: stream_pin_hostbuf_size = stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM - stream_pin_hostbuf.extend(size=stream_pin_hostbuf_size, reallocate=True) - stream_pin_hostbuf._comfy_stream_pin_size = stream_pin_hostbuf_size + if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_hostbuf_size): + for xfer_source, _, _, xfer_dest in stream_pin_queue: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + return offload_stream stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 3638066c8825..a35759aad382 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -16,9 +16,7 @@ def 
pin_memory(module): hostbuf = pin_state["hostbuf"] size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size - if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY: - pin_state["failed"] = True - return False + comfy.model_management.ensure_pin_budget(size) try: hostbuf.extend(size=size) From 2b927e17838b733dde6a660fe521bb0f13768528 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 23:50:37 +1000 Subject: [PATCH 07/23] LowVRAMPatch: change to two-phase visit --- comfy/lora.py | 19 +++++++++++++------ comfy/model_patcher.py | 11 +++++++++-- comfy/ops.py | 2 +- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/comfy/lora.py b/comfy/lora.py index db8f16bcb5ae..f7c7c21a5847 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -475,16 +475,23 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori return weight -def prefetch_prepared_value(value, allocate_buffer, stream): +def prefetch_prepared_value(value, counter, destination, stream): if isinstance(value, torch.Tensor): - dest = allocate_buffer(comfy.memory_management.vram_aligned_size(value)) - comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) + size = comfy.memory_management.vram_aligned_size(value) + offset = counter[0] + counter[0] += size + if destination is None: + return value + + dest = destination[offset:offset + size] + if stream is not None: + comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) return comfy.memory_management.interpret_gathered_like([value], dest)[0] elif isinstance(value, weight_adapter.WeightAdapterBase): - return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, allocate_buffer, stream)) + return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream)) elif isinstance(value, tuple): - return tuple(prefetch_prepared_value(item, allocate_buffer, stream) for item in value) + return tuple(prefetch_prepared_value(item, counter, destination, stream) for item in value) elif isinstance(value, list): - return [prefetch_prepared_value(item, allocate_buffer, stream) for item in value] + return [prefetch_prepared_value(item, counter, destination, stream) for item in value] return value diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index def0901dcd4c..dc58cd42ebcc 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -125,9 +125,16 @@ def __init__(self, key, patches, convert_func=None, set_func=None): self.set_func = set_func self.prepared_patches = None - def prepare(self, allocate_buffer, stream): + def memory_required(self): + counter = [0] + for patch in self.patches[self.key]: + comfy.lora.prefetch_prepared_value(patch[1], counter, None, None) + return counter[0] + + def prepare(self, destination, stream): + counter = [0] self.prepared_patches = [ - (patch[0], comfy.lora.prefetch_prepared_value(patch[1], allocate_buffer, stream), patch[2], patch[3], patch[4]) + (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream), patch[2], patch[3], patch[4]) for patch in self.patches[self.key] ] diff --git a/comfy/ops.py b/comfy/ops.py index ee3184894305..bd3de3677818 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -203,7 +203,7 @@ def get_stream_pin_buffer_offset(buffer_size): lowvram_fn = getattr(s, param_key + "_lowvram_function", None) if lowvram_fn 
is not None: ensure_offload_stream(s, cast_buffer_offset, False) - lowvram_fn.prepare(lambda size: get_cast_buffer(size), offload_stream) + lowvram_fn.prepare(get_cast_buffer(lowvram_fn.memory_required()), offload_stream) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest From 8e473d756f39c5cac5397a8c3b4442e75617068c Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 08:19:47 +1000 Subject: [PATCH 08/23] lora: re-implement as inplace swiss-army-knife operation --- comfy/lora.py | 10 +++++----- comfy/model_management.py | 7 +++---- comfy/model_patcher.py | 13 +++++++++---- comfy/ops.py | 25 ++++++++++++++++++------- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/comfy/lora.py b/comfy/lora.py index f7c7c21a5847..2b8699710612 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -475,7 +475,7 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori return weight -def prefetch_prepared_value(value, counter, destination, stream): +def prefetch_prepared_value(value, counter, destination, stream, copy): if isinstance(value, torch.Tensor): size = comfy.memory_management.vram_aligned_size(value) offset = counter[0] @@ -484,14 +484,14 @@ def prefetch_prepared_value(value, counter, destination, stream): return value dest = destination[offset:offset + size] - if stream is not None: + if copy: comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) return comfy.memory_management.interpret_gathered_like([value], dest)[0] elif isinstance(value, weight_adapter.WeightAdapterBase): - return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream)) + return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream, copy)) elif isinstance(value, tuple): - return tuple(prefetch_prepared_value(item, counter, destination, stream) for item in value) + return tuple(prefetch_prepared_value(item, counter, destination, stream, copy) for item in value) elif isinstance(value, list): - return [prefetch_prepared_value(item, counter, destination, stream) for item in value] + return [prefetch_prepared_value(item, counter, destination, stream, copy) for item in value] return value diff --git a/comfy/model_management.py b/comfy/model_management.py index 6a2126cb59bb..40f72fa1bd86 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1201,7 +1201,7 @@ def get_pin_buffer(offload_stream): def resize_pin_buffer(pin_buffer, size): global TOTAL_PINNED_MEMORY - old_size = getattr(pin_buffer, "_comfy_stream_pin_size", 0) + old_size = pin_buffer.size if size <= old_size: return True growth = size - old_size @@ -1210,8 +1210,7 @@ def resize_pin_buffer(pin_buffer, size): pin_buffer.extend(size=size, reallocate=True) except RuntimeError: return False - pin_buffer._comfy_stream_pin_size = size - TOTAL_PINNED_MEMORY += growth + TOTAL_PINNED_MEMORY += pin_buffer.size - old_size return True def reset_cast_buffers(): @@ -1231,7 +1230,7 @@ def reset_cast_buffers(): DIRTY_MMAPS.clear() for pin_buffer in STREAM_PIN_BUFFERS.values(): - TOTAL_PINNED_MEMORY -= getattr(pin_buffer, "_comfy_stream_pin_size", 0) + TOTAL_PINNED_MEMORY -= pin_buffer.size if TOTAL_PINNED_MEMORY < 0: TOTAL_PINNED_MEMORY = 0 diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index dc58cd42ebcc..a88603df95f7 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -118,6 +118,8 @@ def string_to_seed(data): return comfy.utils.string_to_seed(data) class 
LowVramPatch: + is_lowvram_patch = True + def __init__(self, key, patches, convert_func=None, set_func=None): self.key = key self.patches = patches @@ -128,15 +130,18 @@ def __init__(self, key, patches, convert_func=None, set_func=None): def memory_required(self): counter = [0] for patch in self.patches[self.key]: - comfy.lora.prefetch_prepared_value(patch[1], counter, None, None) + comfy.lora.prefetch_prepared_value(patch[1], counter, None, None, False) return counter[0] - def prepare(self, destination, stream): + def prepare(self, destination, stream, copy=True, commit=True): counter = [0] - self.prepared_patches = [ - (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream), patch[2], patch[3], patch[4]) + prepared_patches = [ + (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream, copy), patch[2], patch[3], patch[4]) for patch in self.patches[self.key] ] + if commit: + self.prepared_patches = prepared_patches + return prepared_patches def clear_prepared(self): self.prepared_patches = None diff --git a/comfy/ops.py b/comfy/ops.py index bd3de3677818..8603b50a66c7 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -203,7 +203,14 @@ def get_stream_pin_buffer_offset(buffer_size): lowvram_fn = getattr(s, param_key + "_lowvram_function", None) if lowvram_fn is not None: ensure_offload_stream(s, cast_buffer_offset, False) - lowvram_fn.prepare(get_cast_buffer(lowvram_fn.memory_required()), offload_stream) + lowvram_size = lowvram_fn.memory_required() + lowvram_dest = get_cast_buffer(lowvram_size) + lowvram_fn.prepare(lowvram_dest, None, copy=False, commit=True) + pin_offset = get_stream_pin_buffer_offset(lowvram_size) + if pin_offset is not None: + stream_pin_queue.append((lowvram_fn, pin_offset, lowvram_size, lowvram_dest)) + else: + lowvram_fn.prepare(lowvram_dest, offload_stream, copy=True, commit=True) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest @@ -211,19 +218,23 @@ def get_stream_pin_buffer_offset(buffer_size): prefetch["needs_cast"] = needs_cast s._prefetch = prefetch + def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): + if getattr(xfer_source, "is_lowvram_patch", False): + xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) + else: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) + if stream_pin_offset > 0: - stream_pin_hostbuf_size = getattr(stream_pin_hostbuf, "_comfy_stream_pin_size", stream_pin_hostbuf.size) - if stream_pin_hostbuf_size < stream_pin_offset: - stream_pin_hostbuf_size = stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM - if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_hostbuf_size): + if stream_pin_hostbuf.size < stream_pin_offset: + if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM): for xfer_source, _, _, xfer_dest in stream_pin_queue: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) return offload_stream stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] - comfy.model_management.cast_to_gathered(xfer_source, pin) + 
cast_maybe_lowvram_patch(xfer_source, pin, None) comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) return offload_stream From e48dace1452df67a3661bcf6d5144e4a7aa8f867 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 09:47:08 +1000 Subject: [PATCH 09/23] prepare for multiple pin sets --- comfy/model_management.py | 2 ++ comfy/model_patcher.py | 37 +++++++++++++++++++------------------ comfy/pinned_memory.py | 13 +++++++------ 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 40f72fa1bd86..ca4318a4535e 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1238,6 +1238,8 @@ def reset_cast_buffers(): model = loaded_model.model if model is not None and model.is_dynamic(): model.dynamic_pins[model.load_device]["active"] = False + model.partially_unload_ram(1e30, subsets=[ "patches" ]) + model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0), []) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index a88603df95f7..530db214cc9c 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1557,8 +1557,8 @@ def __init__(self, model, load_device, offload_device, size=0, weight_inplace_up self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: self.model.dynamic_pins[self.load_device] = { - "hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), - "stack": [], + "weights": (comfy_aimdo.host_buffer.HostBuffer(0), []), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0), []), "failed": False, "active": False, } @@ -1756,25 +1756,26 @@ def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=Fals return freed def pinned_memory_size(self): - return self.model.dynamic_pins[self.load_device]["hostbuf"].size + return (self.model.dynamic_pins[self.load_device]["weights"][0].size + + self.model.dynamic_pins[self.load_device]["patches"][0].size) - def partially_unload_ram(self, ram_to_unload): + def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]): freed = 0 pin_state = self.model.dynamic_pins[self.load_device] - hostbuf = pin_state["hostbuf"] - stack = pin_state["stack"] - while len(stack) > 0: - module, offset = stack.pop() - size = module._pin.numel() * module._pin.element_size() - del module._pin - hostbuf.truncate(offset) - comfy.model_management.TOTAL_PINNED_MEMORY -= size - if comfy.model_management.TOTAL_PINNED_MEMORY < 0: - comfy.model_management.TOTAL_PINNED_MEMORY = 0 - freed += size - ram_to_unload -= size - if ram_to_unload <= 0: - return freed + for subset in subsets: + hostbuf, stack = pin_state[subset] + while len(stack) > 0: + module, offset = stack.pop() + size = module._pin.numel() * module._pin.element_size() + del module._pin + hostbuf.truncate(offset) + comfy.model_management.TOTAL_PINNED_MEMORY -= size + if comfy.model_management.TOTAL_PINNED_MEMORY < 0: + comfy.model_management.TOTAL_PINNED_MEMORY = 0 + freed += size + ram_to_unload -= size + if ram_to_unload <= 0: + return freed return freed def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False): diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index a35759aad382..208c777f8f33 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -5,16 +5,17 @@ from comfy.cli_args import args -def get_pin(module): +def get_pin(module, 
subset="weights"): return getattr(module, "_pin", None) -def pin_memory(module): +def pin_memory(module, subset="weights", size=None): pin_state = module._pin_state - if pin_state["failed"] or args.disable_pinned_memory or get_pin(module) is not None: + if pin_state["failed"] or args.disable_pinned_memory or get_pin(module, subset) is not None: return - hostbuf = pin_state["hostbuf"] - size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) + hostbuf, stack = pin_state[subset] + if size is None: + size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size comfy.model_management.ensure_pin_budget(size) @@ -26,6 +27,6 @@ def pin_memory(module): module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] module._pin.untyped_storage()._comfy_hostbuf = hostbuf - pin_state["stack"].append((module, offset)) + stack.append((module, offset)) comfy.model_management.TOTAL_PINNED_MEMORY += size return True From 3a3b75a7e3cc1175e3f9f0d90c5838fb83c9b518 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 23:28:43 +1000 Subject: [PATCH 10/23] implement pinned loras --- comfy/model_management.py | 6 ++--- comfy/model_patcher.py | 8 +++--- comfy/ops.py | 55 +++++++++++++++++++++------------------ 3 files changed, 38 insertions(+), 31 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index ca4318a4535e..145a32080605 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -524,7 +524,7 @@ def ensure_pin_budget(size, evict_active=False): shortfall += PIN_PRESSURE_HYSTERESIS for loaded_model in reversed(current_loaded_models): model = loaded_model.model - if model is not None and model.is_dynamic() and (evict_active or not model.dynamic_pins[model.load_device]["active"]): + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): shortfall -= model.partially_unload_ram(shortfall) if shortfall <= 0: break @@ -1237,9 +1237,9 @@ def reset_cast_buffers(): for loaded_model in current_loaded_models: model = loaded_model.model if model is not None and model.is_dynamic(): - model.dynamic_pins[model.load_device]["active"] = False + model.model.dynamic_pins[model.load_device]["active"] = False model.partially_unload_ram(1e30, subsets=[ "patches" ]) - model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0), []) + model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 530db214cc9c..f4845bb43c03 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1557,8 +1557,8 @@ def __init__(self, model, load_device, offload_device, size=0, weight_inplace_up self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: self.model.dynamic_pins[self.load_device] = { - "weights": (comfy_aimdo.host_buffer.HostBuffer(0), []), - "patches": (comfy_aimdo.host_buffer.HostBuffer(0), []), + "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), []), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []), "failed": False, "active": False, } @@ -1651,7 +1651,9 @@ def setup_param(self, m, n, param_key): if key in self.patches: if comfy.lora.calculate_shape(self.patches[key], weight, key) != weight.shape: return (True, 0) - setattr(m, param_key + 
"_lowvram_function", LowVramPatch(key, self.patches)) + lowvram_patch = LowVramPatch(key, self.patches) + lowvram_patch._pin_state = pin_state + setattr(m, param_key + "_lowvram_function", lowvram_patch) num_patches += 1 else: setattr(m, param_key + "_lowvram_function", None) diff --git a/comfy/ops.py b/comfy/ops.py index 8603b50a66c7..629b54e4cf9c 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -183,34 +183,45 @@ def get_stream_pin_buffer_offset(buffer_size): if xfer_dest is None: xfer_dest = get_cast_buffer(dest_size) - if pin is None: + def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): + if xfer_source is not None: + if getattr(xfer_source, "is_lowvram_patch", False): + xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) + else: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) + + def handle_pin_miss(m, source, dest, subset="weights", size=None): + pin = None if signature is None: - comfy.pinned_memory.pin_memory(s) - pin = comfy.pinned_memory.get_pin(s) + comfy.pinned_memory.pin_memory(m, subset=subset, size=size) + pin = comfy.pinned_memory.get_pin(m, subset=subset) if pin is not None: - comfy.model_management.cast_to_gathered(xfer_source, pin) - xfer_source = [ pin ] + cast_maybe_lowvram_patch(source, pin, None) + return [ pin ] if pin is None: - pin_offset = get_stream_pin_buffer_offset(dest_size) + pin_offset = get_stream_pin_buffer_offset(size) if pin_offset is not None: - stream_pin_queue.append((xfer_source, pin_offset, dest_size, xfer_dest)) - xfer_source = None + stream_pin_queue.append((source, pin_offset, size, dest)) + return None + return source - if xfer_source is not None: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + if pin is None: + xfer_source = handle_pin_miss(s, xfer_source, xfer_dest, size=dest_size) + + cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) for param_key in ("weight", "bias"): - lowvram_fn = getattr(s, param_key + "_lowvram_function", None) - if lowvram_fn is not None: + lowvram_source = getattr(s, param_key + "_lowvram_function", None) + if lowvram_source is not None: ensure_offload_stream(s, cast_buffer_offset, False) - lowvram_size = lowvram_fn.memory_required() + lowvram_size = lowvram_source.memory_required() lowvram_dest = get_cast_buffer(lowvram_size) - lowvram_fn.prepare(lowvram_dest, None, copy=False, commit=True) - pin_offset = get_stream_pin_buffer_offset(lowvram_size) - if pin_offset is not None: - stream_pin_queue.append((lowvram_fn, pin_offset, lowvram_size, lowvram_dest)) - else: - lowvram_fn.prepare(lowvram_dest, offload_stream, copy=True, commit=True) + lowvram_source.prepare(lowvram_dest, None, copy=False, commit=True) + + pin = comfy.pinned_memory.get_pin(lowvram_source, subset="patches") + lowvram_source = handle_pin_miss(lowvram_source, lowvram_source, lowvram_dest, subset="patches", size=lowvram_size) if pin is None else [ pin ] + + cast_maybe_lowvram_patch(lowvram_source, lowvram_dest, offload_stream) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest @@ -218,12 +229,6 @@ def get_stream_pin_buffer_offset(buffer_size): prefetch["needs_cast"] = needs_cast s._prefetch = prefetch - def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): - if getattr(xfer_source, "is_lowvram_patch", False): - xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) - else: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, 
non_blocking=non_blocking, stream=stream) - if stream_pin_offset > 0: if stream_pin_hostbuf.size < stream_pin_offset: if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM): From c395f2d5b7ec83b22c597e0e0d936e3cc35f822e Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 23:30:47 +1000 Subject: [PATCH 11/23] requirements: comfy-aimdo 0.4.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c5a6f4cec2ce..eba0fc5ca757 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0 filelock av>=14.2.0 comfy-kitchen>=0.2.8 -comfy-aimdo==0.3.0 +comfy-aimdo==0.4.0 requests simpleeval>=1.0.0 blake3 From 44c0a0602b575287b48f09cdb11e6969683e39da Mon Sep 17 00:00:00 2001 From: Rattus Date: Mon, 11 May 2026 18:51:39 +1000 Subject: [PATCH 12/23] ops: remove unused arg This was defeatured in aimdo iteration --- comfy/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index 629b54e4cf9c..d425ea7eb3dc 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -235,7 +235,7 @@ def handle_pin_miss(m, source, dest, subset="weights", size=None): for xfer_source, _, _, xfer_dest in stream_pin_queue: cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) return offload_stream - stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) + stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf) stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] From ee927aafa8770a4bea8cfc2fcbedba5f86656097 Mon Sep 17 00:00:00 2001 From: Rattus Date: Sat, 9 May 2026 18:41:57 +1000 Subject: [PATCH 13/23] ops: sync the CPU with only the offload stream activity This was syncing with the offload stream which itself is synced with the compute stream, so this was syncing CPU with compute transitively. Define the event to sync it more gently. 
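Sketch only, using plain torch.cuda events: the gentler synchronization patch 13 describes. Instead of offload_stream.synchronize(), which transitively waits on the compute stream too, an event is recorded right after the pinned-buffer copies and the CPU later waits only on that; the _comfy_event attribute name mirrors the diff below.

    import torch

    def finish_prefetch(pin_buffer, offload_stream: torch.cuda.Stream):
        # Mark where the offload stream is right now; reusing the pin buffer later
        # only has to wait for this point, not for everything queued on compute.
        pin_buffer._comfy_event = offload_stream.record_event()

    def reuse_pin_buffer(pin_buffer):
        event = getattr(pin_buffer, "_comfy_event", None)
        if event is not None:
            event.synchronize()          # CPU blocks on the offload copies only
            delattr(pin_buffer, "_comfy_event")
        return pin_buffer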
--- comfy/model_management.py | 5 ++++- comfy/ops.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 145a32080605..c1d0901fc61d 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1196,7 +1196,10 @@ def get_pin_buffer(offload_stream): pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0) STREAM_PIN_BUFFERS[offload_stream] = pin_buffer elif offload_stream is not None: - offload_stream.synchronize() + event = getattr(pin_buffer, "_comfy_event", None) + if event is not None: + event.synchronize() + delattr(pin_buffer, "_comfy_event") return pin_buffer def resize_pin_buffer(pin_buffer, size): diff --git a/comfy/ops.py b/comfy/ops.py index d425ea7eb3dc..be744a030b05 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -241,6 +241,7 @@ def handle_pin_miss(m, source, dest, subset="weights", size=None): pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] cast_maybe_lowvram_patch(xfer_source, pin, None) comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) + stream_pin_hostbuf._comfy_event = offload_stream.record_event() return offload_stream From d61026d020946b986a3a4a1969d9198c90b7e8ec Mon Sep 17 00:00:00 2001 From: Rattus Date: Wed, 13 May 2026 09:17:23 +1000 Subject: [PATCH 14/23] pins: implement freeing intermediate for pinned memory Pinning is more important than inactive intermediates and the stream pin buffer is more important than even active intermediates. --- comfy/memory_management.py | 4 ++-- comfy/model_management.py | 2 ++ comfy/pinned_memory.py | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 4a628b05c18f..7645064f59a6 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -157,7 +157,7 @@ def set_ram_cache_release_state(callback, headroom): extra_ram_release_callback = callback RAM_CACHE_HEADROOM = max(0, int(headroom)) -def extra_ram_release(target): +def extra_ram_release(target, free_active=False): if extra_ram_release_callback is None: return 0 - return extra_ram_release_callback(target) + return extra_ram_release_callback(target, free_active=free_active) diff --git a/comfy/model_management.py b/comfy/model_management.py index c1d0901fc61d..697359d3a07d 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1208,6 +1208,7 @@ def resize_pin_buffer(pin_buffer, size): if size <= old_size: return True growth = size - old_size + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM, free_active=True) ensure_pin_budget(growth, evict_active=True) try: pin_buffer.extend(size=size, reallocate=True) @@ -1389,6 +1390,7 @@ def pin_memory(tensor): return False size = tensor.nbytes + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) ensure_pin_budget(size) ptr = tensor.data_ptr() diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 208c777f8f33..35cbbcd9e215 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -17,6 +17,7 @@ def pin_memory(module, subset="weights", size=None): if size is None: size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) comfy.model_management.ensure_pin_budget(size) try: From 3f717816e1f194fa2a9a105fb425b7bbfbb781f7 Mon Sep 17 00:00:00 2001 From: Rattus Date: Wed, 
13 May 2026 21:57:35 +1000 Subject: [PATCH 15/23] execution: implement pin eviction on RAM pressure Add back proper pin freeing on RAM pressure --- comfy/model_management.py | 19 ++++++++++++------- execution.py | 5 ++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 697359d3a07d..f358621c9b83 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -513,6 +513,17 @@ def mark_mmap_dirty(storage): if mmap_refs is not None: DIRTY_MMAPS.add(mmap_refs[0]) +def free_pins(size, evict_active=False): + if size <= 0: + return + + for loaded_model in reversed(current_loaded_models): + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): + size -= model.partially_unload_ram(size) + if size <= 0: + break + def ensure_pin_budget(size, evict_active=False): if MAX_PINNED_MEMORY <= 0: return @@ -521,13 +532,7 @@ def ensure_pin_budget(size, evict_active=False): if shortfall <= 0: return - shortfall += PIN_PRESSURE_HYSTERESIS - for loaded_model in reversed(current_loaded_models): - model = loaded_model.model - if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): - shortfall -= model.partially_unload_ram(shortfall) - if shortfall <= 0: - break + free_pins(shortfall + PIN_PRESSURE_HYSTERESIS, evict_active=evict_active) class LoadedModel: def __init__(self, model): diff --git a/execution.py b/execution.py index f37d0360d55c..5605f09e7d1d 100644 --- a/execution.py +++ b/execution.py @@ -2,6 +2,7 @@ import heapq import inspect import logging +import psutil import sys import threading import time @@ -780,7 +781,9 @@ async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs= execution_list.complete_node_execution() if self.cache_type == CacheType.RAM_PRESSURE: - comfy.model_management.free_memory(0, None, pins_required=ram_headroom, ram_required=ram_headroom) + ram_release_callback(ram_headroom) + ram_shortfall = ram_headroom - psutil.virtual_memory().available + comfy.model_management.free_pins(ram_shortfall) ram_release_callback(ram_headroom, free_active=True) else: # Only execute when the while-loop ends without break From 31150538b0a75734e46bc1f7a0d2bb1fce5d1fa9 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 15 May 2026 00:28:13 +1000 Subject: [PATCH 16/23] implement pin registration swaps Uncap the Windows pins from 50% by extending the pool and adding a pressure mechanism to move the pin reservations on demand. This unfortunately implies a GPU sync to do the freeing, so significant hysteresis needs to be added to consolidate these pressure events. --- comfy/model_management.py | 49 +++++++++++++++++++++++++++++---------- comfy/model_patcher.py | 39 +++++++++++++++++++++++++------ comfy/pinned_memory.py | 27 +++++++++++++++++++-- 3 files changed, 94 insertions(+), 21 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index f358621c9b83..19a9163620e6 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -498,7 +498,11 @@ def get_torch_device_name(device): DIRTY_MMAPS = set() -PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024 +PIN_PRESSURE_HYSTERESIS = 256 * 1024 * 1024 + +#Freeing registerables on pressure does imply a GPU sync, so go big on +#the hysteresis so each expensive sync gives us back a good chunk.
+REGISTERABLE_PIN_HYSTERESIS = 768 * 1024 * 1024 def module_size(module): module_mem = 0 @@ -525,15 +529,28 @@ def free_pins(size, evict_active=False): break def ensure_pin_budget(size, evict_active=False): - if MAX_PINNED_MEMORY <= 0: + if MAX_MODEL_MEMORY <= 0: return - shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + shortfall = TOTAL_MODEL_MEMORY + size - MAX_MODEL_MEMORY if shortfall <= 0: return free_pins(shortfall + PIN_PRESSURE_HYSTERESIS, evict_active=evict_active) +def ensure_pin_registerable(size, evict_active=False): + shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + if MAX_PINNED_MEMORY <= 0 or shortfall <= 0: + return + + shortfall += REGISTERABLE_PIN_HYSTERESIS + for loaded_model in reversed(current_loaded_models): + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): + shortfall -= model.unregister_inactive_pins(shortfall) + if shortfall <= 0: + return + class LoadedModel: def __init__(self, model): self._set_model(model) @@ -1208,22 +1225,24 @@ def get_pin_buffer(offload_stream): return pin_buffer def resize_pin_buffer(pin_buffer, size): - global TOTAL_PINNED_MEMORY + global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY old_size = pin_buffer.size if size <= old_size: return True growth = size - old_size comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM, free_active=True) ensure_pin_budget(growth, evict_active=True) + ensure_pin_registerable(growth, evict_active=True) try: pin_buffer.extend(size=size, reallocate=True) except RuntimeError: return False + TOTAL_MODEL_MEMORY += pin_buffer.size - old_size TOTAL_PINNED_MEMORY += pin_buffer.size - old_size return True def reset_cast_buffers(): - global TOTAL_PINNED_MEMORY + global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT @@ -1239,16 +1258,17 @@ def reset_cast_buffers(): DIRTY_MMAPS.clear() for pin_buffer in STREAM_PIN_BUFFERS.values(): + TOTAL_MODEL_MEMORY -= pin_buffer.size TOTAL_PINNED_MEMORY -= pin_buffer.size - if TOTAL_PINNED_MEMORY < 0: - TOTAL_PINNED_MEMORY = 0 + TOTAL_MODEL_MEMORY = max(0, TOTAL_MODEL_MEMORY) + TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY) for loaded_model in current_loaded_models: model = loaded_model.model if model is not None and model.is_dynamic(): model.model.dynamic_pins[model.load_device]["active"] = False model.partially_unload_ram(1e30, subsets=[ "patches" ]) - model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []) + model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1]) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() @@ -1352,14 +1372,18 @@ def cast_to_device(tensor, device, dtype, copy=False): PINNED_MEMORY = {} +TOTAL_MODEL_MEMORY = 0 TOTAL_PINNED_MEMORY = 0 +MAX_MODEL_MEMORY = -1 MAX_PINNED_MEMORY = -1 if not args.disable_pinned_memory: if is_nvidia() or is_amd(): + ram = get_total_memory(torch.device("cpu")) + MAX_MODEL_MEMORY = min(ram - 4 * 1024 * 1024 * 1024, ram * 0.90) if WINDOWS: - MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50% + MAX_PINNED_MEMORY = ram * 0.40 # Windows limit is apparently 50% else: - MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90 + MAX_PINNED_MEMORY = ram * 0.90 logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 
1024))) PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"]) @@ -1396,7 +1420,7 @@ def pin_memory(tensor): size = tensor.nbytes comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) - ensure_pin_budget(size) + ensure_pin_registerable(size) ptr = tensor.data_ptr() if ptr == 0: @@ -1433,7 +1457,8 @@ def unpin_memory(tensor): return False if torch.cuda.cudart().cudaHostUnregister(ptr) == 0: - TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr) + size = PINNED_MEMORY.pop(ptr) + TOTAL_PINNED_MEMORY -= size return True else: logging.warning("Unpin error.") diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index f4845bb43c03..7dc4d7801439 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1557,8 +1557,8 @@ def __init__(self, model, load_device, offload_device, size=0, weight_inplace_up self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: self.model.dynamic_pins[self.load_device] = { - "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), []), - "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []), + "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), [], [-1]), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1]), "failed": False, "active": False, } @@ -1761,19 +1761,44 @@ def pinned_memory_size(self): return (self.model.dynamic_pins[self.load_device]["weights"][0].size + self.model.dynamic_pins[self.load_device]["patches"][0].size) + def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]): + freed = 0 + pin_state = self.model.dynamic_pins[self.load_device] + for subset in subsets: + hostbuf, stack, stack_split = pin_state[subset] + split = stack_split[0] + while split >= 0: + module, offset = stack[split] + split -= 1 + stack_split[0] = split + if not module._pin_registered: + continue + size = module._pin.numel() * module._pin.element_size() + if torch.cuda.cudart().cudaHostUnregister(module._pin.data_ptr()) != 0: + comfy.model_management.discard_cuda_async_error() + continue + module._pin_registered = False + comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size) + freed += size + ram_to_unload -= size + if ram_to_unload <= 0: + return freed + return freed + def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]): freed = 0 pin_state = self.model.dynamic_pins[self.load_device] for subset in subsets: - hostbuf, stack = pin_state[subset] + hostbuf, stack, stack_split = pin_state[subset] while len(stack) > 0: module, offset = stack.pop() size = module._pin.numel() * module._pin.element_size() del module._pin - hostbuf.truncate(offset) - comfy.model_management.TOTAL_PINNED_MEMORY -= size - if comfy.model_management.TOTAL_PINNED_MEMORY < 0: - comfy.model_management.TOTAL_PINNED_MEMORY = 0 + hostbuf.truncate(offset, do_unregister=module._pin_registered) + stack_split[0] = min(stack_split[0], len(stack) - 1) + comfy.model_management.TOTAL_MODEL_MEMORY = max(0, comfy.model_management.TOTAL_MODEL_MEMORY - size) + if module._pin_registered: + comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size) freed += size ram_to_unload -= size if ram_to_unload <= 0: diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 35cbbcd9e215..8fe69916f988 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -2,6 +2,7 @@ import comfy.memory_management import 
comfy_aimdo.host_buffer import comfy_aimdo.torch +import torch from comfy.cli_args import args @@ -10,15 +11,33 @@ def get_pin(module, subset="weights"): def pin_memory(module, subset="weights", size=None): pin_state = module._pin_state - if pin_state["failed"] or args.disable_pinned_memory or get_pin(module, subset) is not None: + if pin_state["failed"] or args.disable_pinned_memory: return - hostbuf, stack = pin_state[subset] + hostbuf, stack, stack_split = pin_state[subset] + pin = get_pin(module, subset) + if pin is not None: + if module._pin_registered: + return + + size = module._pin.nbytes + comfy.model_management.ensure_pin_registerable(size) + + if torch.cuda.cudart().cudaHostRegister(module._pin.data_ptr(), size, 1) != 0: + comfy.model_management.discard_cuda_async_error() + return False + module._pin_registered = True + stack_split[0] = max(stack_split[0], module._pin_stack_index) + comfy.model_management.TOTAL_PINNED_MEMORY += size + return True + if size is None: size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) comfy.model_management.ensure_pin_budget(size) + comfy.model_management.ensure_pin_registerable(size) try: hostbuf.extend(size=size) @@ -29,5 +48,9 @@ def pin_memory(module, subset="weights", size=None): module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] module._pin.untyped_storage()._comfy_hostbuf = hostbuf stack.append((module, offset)) + module._pin_registered = True + module._pin_stack_index = len(stack) - 1 + stack_split[0] = max(stack_split[0], module._pin_stack_index) + comfy.model_management.TOTAL_MODEL_MEMORY += size comfy.model_management.TOTAL_PINNED_MEMORY += size return True From 18a74cb96ab6137f67229ffc0aa7e0f11a1e5ff3 Mon Sep 17 00:00:00 2001 From: Rattus Date: Wed, 13 May 2026 22:15:54 +1000 Subject: [PATCH 17/23] cli_args/execution: Implement lower background cache-ram threshold Limit the amount of RAM background intermediates can use, so that switching workflows doesn't degrade performance too much. --- comfy/cli_args.py | 7 ++++--- execution.py | 3 ++- main.py | 14 ++++++++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 9dadb0093bf0..e0d7d4af4b48 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -110,13 +110,11 @@ def from_string(cls, value: str): parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.") -CACHE_RAM_AUTO_GB = -1.0 - cache_group = parser.add_mutually_exclusive_group() cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.") cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.") cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.") -cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. 
Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).") +cache_group.add_argument("--cache-ram", nargs='*', type=float, default=None, metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).") attn_group = parser.add_mutually_exclusive_group() attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.") @@ -246,6 +244,9 @@ def is_valid_directory(path: str) -> str: else: args = parser.parse_args([]) +if args.cache_ram is not None and len(args.cache_ram) > 2: + parser.error("--cache-ram accepts at most two values: active GB and inactive GB") + if args.windows_standalone_build: args.auto_launch = True diff --git a/execution.py b/execution.py index 5605f09e7d1d..9c3968810631 100644 --- a/execution.py +++ b/execution.py @@ -728,6 +728,7 @@ async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs= self._notify_prompt_lifecycle("start", prompt_id) ram_headroom = int(self.cache_args["ram"] * (1024 ** 3)) + ram_inactive_headroom = int(self.cache_args["ram_inactive"] * (1024 ** 3)) ram_release_callback = self.caches.outputs.ram_release if self.cache_type == CacheType.RAM_PRESSURE else None comfy.memory_management.set_ram_cache_release_state(ram_release_callback, ram_headroom) @@ -781,7 +782,7 @@ async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs= execution_list.complete_node_execution() if self.cache_type == CacheType.RAM_PRESSURE: - ram_release_callback(ram_headroom) + ram_release_callback(ram_inactive_headroom) ram_shortfall = ram_headroom - psutil.virtual_memory().available comfy.model_management.free_pins(ram_shortfall) ram_release_callback(ram_headroom, free_active=True) diff --git a/main.py b/main.py index a6fdaf43c7db..ad9742252be2 100644 --- a/main.py +++ b/main.py @@ -283,19 +283,25 @@ def _collect_output_absolute_paths(history_result: dict) -> list[str]: def prompt_worker(q, server_instance): current_time: float = 0.0 - cache_ram = args.cache_ram - if cache_ram < 0: + cache_ram = 0 + cache_ram_inactive = 0 + if args.cache_ram is not None: cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0)) + cache_ram_inactive = min(96.0, max(12.0, comfy.model_management.total_ram * 0.75 / 1024.0)) + if len(args.cache_ram) > 0: + cache_ram = args.cache_ram[0] + if len(args.cache_ram) > 1: + cache_ram_inactive = args.cache_ram[1] cache_type = execution.CacheType.CLASSIC if args.cache_lru > 0: cache_type = execution.CacheType.LRU - elif cache_ram > 0: + elif max(cache_ram, cache_ram_inactive) > 0: cache_type = execution.CacheType.RAM_PRESSURE elif args.cache_none: cache_type = execution.CacheType.NONE - e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram } ) + e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram, "ram_inactive" : cache_ram_inactive } ) last_gc_collect = 0 need_gc = False gc_collect_interval = 10.0 From d8b442709a5607a3f82bcf5a03f2f81b1cdacadc Mon Sep 17 00:00:00 2001 From: Rattus Date: Wed, 13 May 2026 22:23:37 +1000 Subject: [PATCH 18/23] make default --- comfy/cli_args.py | 2 +- 
main.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index e0d7d4af4b48..d5d13008b67c 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -111,10 +111,10 @@ def from_string(cls, value: str): parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.") cache_group = parser.add_mutually_exclusive_group() +cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).") cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.") cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.") cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.") -cache_group.add_argument("--cache-ram", nargs='*', type=float, default=None, metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).") attn_group = parser.add_mutually_exclusive_group() attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. 
Ignored when xformers is used.") diff --git a/main.py b/main.py index ad9742252be2..1e47cab84cb7 100644 --- a/main.py +++ b/main.py @@ -285,7 +285,7 @@ def prompt_worker(q, server_instance): current_time: float = 0.0 cache_ram = 0 cache_ram_inactive = 0 - if args.cache_ram is not None: + if not args.cache_classic and not args.cache_none and args.cache_lru <= 0: cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0)) cache_ram_inactive = min(96.0, max(12.0, comfy.model_management.total_ram * 0.75 / 1024.0)) if len(args.cache_ram) > 0: @@ -293,11 +293,11 @@ def prompt_worker(q, server_instance): if len(args.cache_ram) > 1: cache_ram_inactive = args.cache_ram[1] - cache_type = execution.CacheType.CLASSIC - if args.cache_lru > 0: + cache_type = execution.CacheType.RAM_PRESSURE + if args.cache_classic: + cache_type = execution.CacheType.CLASSIC + elif args.cache_lru > 0: cache_type = execution.CacheType.LRU - elif max(cache_ram, cache_ram_inactive) > 0: - cache_type = execution.CacheType.RAM_PRESSURE elif args.cache_none: cache_type = execution.CacheType.NONE From 55197d8bfc1cddd484e61dd01bfd35af8e49fa97 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 15 May 2026 13:19:57 +1000 Subject: [PATCH 19/23] bump aimdo --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index eba0fc5ca757..6754c94c4226 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0 filelock av>=14.2.0 comfy-kitchen>=0.2.8 -comfy-aimdo==0.4.0 +comfy-aimdo==0.4.1 requests simpleeval>=1.0.0 blake3 From 0242954aaa24a5262cd59434a555deaf513129b9 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 15 May 2026 20:02:09 +1000 Subject: [PATCH 20/23] model-patcher: force-cast tiny weights Flux 2 gets severe stalls due to a mix of tiny and giant weights creating lopsided stream buffer rotations. --- comfy/model_patcher.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 7dc4d7801439..e1cd3283123e 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1670,6 +1670,9 @@ def setup_param(self, m, n, param_key): def force_load_param(self, param_key, device_to): key = key_param_name_to_key(n, param_key) + weight, _, _ = get_key_weight(self.model, key) + if weight is None: + return if key in self.backup: comfy.utils.set_attr_param(self.model, key, self.backup[key].weight) self.patch_weight_to_device(key, device_to=device_to, force_cast=True) @@ -1683,13 +1686,19 @@ def force_load_param(self, param_key, device_to): m._pin_state = pin_state set_dirty(m, dirty) - force_load, v_weight_size = setup_param(self, m, n, "weight") - force_load_bias, v_weight_bias = setup_param(self, m, n, "bias") - force_load = force_load or force_load_bias - v_weight_size += v_weight_bias + #Models that mix tiny and giant weights can cause lopsided stream buffer + #rotations and stall. Force the tiny ones over.
+ if module_mem > 16 * 1024: + force_load, v_weight_size = setup_param(self, m, n, "weight") + force_load_bias, v_weight_bias = setup_param(self, m, n, "bias") + force_load = force_load or force_load_bias + v_weight_size += v_weight_bias + if force_load: + logging.info(f"Module {n} has resizing Lora - force loading") + else: + force_load=True if force_load: - logging.info(f"Module {n} has resizing Lora - force loading") force_load_param(self, "weight", device_to) force_load_param(self, "bias", device_to) else: From ed15d62a6c6f7ed645e26b5917a242922df1c5a0 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 15 May 2026 22:26:17 +1000 Subject: [PATCH 21/23] ops: refactor in prep for chunking --- comfy/ops.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index be744a030b05..4b436f4a7510 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -190,25 +190,25 @@ def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): else: comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) - def handle_pin_miss(m, source, dest, subset="weights", size=None): - pin = None + def handle_pin(m, pin, source, dest, subset="weights", size=None): + if pin is not None: + cast_maybe_lowvram_patch([pin], dest, offload_stream) + return if signature is None: comfy.pinned_memory.pin_memory(m, subset=subset, size=size) pin = comfy.pinned_memory.get_pin(m, subset=subset) if pin is not None: cast_maybe_lowvram_patch(source, pin, None) - return [ pin ] + cast_maybe_lowvram_patch([ pin ], dest, offload_stream) + return if pin is None: pin_offset = get_stream_pin_buffer_offset(size) if pin_offset is not None: stream_pin_queue.append((source, pin_offset, size, dest)) - return None - return source - - if pin is None: - xfer_source = handle_pin_miss(s, xfer_source, xfer_dest, size=dest_size) + return + cast_maybe_lowvram_patch(source, dest, offload_stream) - cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) + handle_pin(s, pin, xfer_source, xfer_dest, size=dest_size) for param_key in ("weight", "bias"): lowvram_source = getattr(s, param_key + "_lowvram_function", None) @@ -219,9 +219,8 @@ def handle_pin_miss(m, source, dest, subset="weights", size=None): lowvram_source.prepare(lowvram_dest, None, copy=False, commit=True) pin = comfy.pinned_memory.get_pin(lowvram_source, subset="patches") - lowvram_source = handle_pin_miss(lowvram_source, lowvram_source, lowvram_dest, subset="patches", size=lowvram_size) if pin is None else [ pin ] + handle_pin(lowvram_source, pin, lowvram_source, lowvram_dest, subset="patches", size=lowvram_size) - cast_maybe_lowvram_patch(lowvram_source, lowvram_dest, offload_stream) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest From 43865639a74841dfaada96ca49485f570e231b46 Mon Sep 17 00:00:00 2001 From: Rattus Date: Sat, 16 May 2026 02:48:51 +1000 Subject: [PATCH 22/23] mm: delegate pin-on-the-way to aimdo Aimdo is able to chunk and slice this on the way for better CPU->GPU overlap. The main advantage is the ability to shorten the bus contention window between previous weight transfer and the next weights vbar fault. 
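For illustration, the overlap being chased here looks roughly like the plain-torch sketch below. This is a hypothetical double-buffered staging helper, not the aimdo implementation (aimdo does the chunking and slicing internally); names such as chunked_h2d and the 16 MB chunk size are assumptions for the example.

    import torch

    def chunked_h2d(src_cpu, dst_gpu, chunk_bytes=16 * 1024 * 1024):
        # Rotate two pinned staging buffers so that filling buffer b with the
        # next chunk (a CPU memcpy) overlaps with the async H2D copy of the
        # previous chunk still in flight on the side stream.
        stream = torch.cuda.Stream()
        src = src_cpu.view(-1).view(torch.uint8)   # assumes contiguous tensors
        dst = dst_gpu.view(-1).view(torch.uint8)
        bufs = [torch.empty(chunk_bytes, dtype=torch.uint8, pin_memory=True) for _ in range(2)]
        done = [torch.cuda.Event() for _ in range(2)]
        for i, start in enumerate(range(0, src.numel(), chunk_bytes)):
            n = min(chunk_bytes, src.numel() - start)
            b = i % 2
            done[b].synchronize()                      # wait until staging buffer b is free
            bufs[b][:n].copy_(src[start:start + n])    # CPU -> pinned staging
            with torch.cuda.stream(stream):
                dst[start:start + n].copy_(bufs[b][:n], non_blocking=True)  # pinned -> GPU
                done[b].record(stream)                 # buffer b reusable after this copy
        stream.synchronize()

Smaller chunks put the first bytes on the bus sooner, which is what shortens the contention window described above.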
--- comfy/memory_management.py | 15 ++++++++++++--- comfy/model_management.py | 8 ++++++-- comfy/ops.py | 14 ++++++++++---- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 7645064f59a6..21e3cf59b426 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -15,7 +15,7 @@ class TensorFileSlice(NamedTuple): size: int -def read_tensor_file_slice_into(tensor, destination): +def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=None): if isinstance(tensor, QuantizedTensor): if not isinstance(destination, QuantizedTensor): @@ -23,12 +23,17 @@ def read_tensor_file_slice_into(tensor, destination): if tensor._layout_cls != destination._layout_cls: return False - if not read_tensor_file_slice_into(tensor._qdata, destination._qdata): + if not read_tensor_file_slice_into(tensor._qdata, destination._qdata, stream=stream, + destination2=(destination2._qdata if destination2 is not None else None)): return False dst_orig_dtype = destination._params.orig_dtype destination._params.copy_from(tensor._params, non_blocking=False) destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype) + if destination2 is not None: + dst_orig_dtype = destination2._params.orig_dtype + destination2._params.copy_from(destination._params, non_blocking=True) + destination2._params = dataclasses.replace(destination2._params, orig_dtype=dst_orig_dtype) return True info = getattr(tensor.untyped_storage(), "_comfy_tensor_file_slice", None) @@ -50,8 +55,12 @@ def read_tensor_file_slice_into(tensor, destination): hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None) if hostbuf is not None: + stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0 + device_ptr = destination2.data_ptr() if destination2 is not None else 0 hostbuf.read_file_slice(file_obj, info.offset, info.size, - offset=destination.data_ptr() - hostbuf.get_raw_address()) + offset=destination.data_ptr() - hostbuf.get_raw_address(), + stream=stream_ptr, + device_ptr=device_ptr) return True buf_type = ctypes.c_ubyte * info.size diff --git a/comfy/model_management.py b/comfy/model_management.py index 19a9163620e6..72ef77ee997c 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1318,7 +1318,7 @@ def sync_stream(device, stream): current_stream(device).wait_stream(stream) -def cast_to_gathered(tensors, r, non_blocking=False, stream=None): +def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None): wf_context = nullcontext() if stream is not None: wf_context = stream @@ -1326,16 +1326,20 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None): wf_context = wf_context.as_context(stream) dest_views = comfy.memory_management.interpret_gathered_like(tensors, r) + dest2_views = comfy.memory_management.interpret_gathered_like(tensors, r2) if r2 is not None else None with wf_context: for tensor in tensors: dest_view = dest_views.pop(0) + dest2_view = dest2_views.pop(0) if dest2_views is not None else None if tensor is None: continue - if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view): + if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view, stream=stream, destination2=dest2_view): continue storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() mark_mmap_dirty(storage) dest_view.copy_(tensor, non_blocking=non_blocking) + if dest2_view 
is not None: + dest2_view.copy_(dest_view, non_blocking=non_blocking) def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None, r=None): diff --git a/comfy/ops.py b/comfy/ops.py index 4b436f4a7510..2f364a3a54a6 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -198,8 +198,11 @@ def handle_pin(m, pin, source, dest, subset="weights", size=None): comfy.pinned_memory.pin_memory(m, subset=subset, size=size) pin = comfy.pinned_memory.get_pin(m, subset=subset) if pin is not None: - cast_maybe_lowvram_patch(source, pin, None) - cast_maybe_lowvram_patch([ pin ], dest, offload_stream) + if isinstance(source, list): + comfy.model_management.cast_to_gathered(source, pin, non_blocking=non_blocking, stream=offload_stream, r2=dest) + else: + cast_maybe_lowvram_patch(source, pin, None) + cast_maybe_lowvram_patch([ pin ], dest, offload_stream) return if pin is None: pin_offset = get_stream_pin_buffer_offset(size) @@ -238,8 +241,11 @@ def handle_pin(m, pin, source, dest, subset="weights", size=None): stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] - cast_maybe_lowvram_patch(xfer_source, pin, None) - comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) + if isinstance(xfer_source, list): + comfy.model_management.cast_to_gathered(xfer_source, pin, non_blocking=non_blocking, stream=offload_stream, r2=xfer_dest) + else: + cast_maybe_lowvram_patch(xfer_source, pin, None) + comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) stream_pin_hostbuf._comfy_event = offload_stream.record_event() return offload_stream From 52a68b9b1c11b177ef62dce9eef8cdc1ac9d1db4 Mon Sep 17 00:00:00 2001 From: Rattus Date: Sat, 16 May 2026 02:56:24 +1000 Subject: [PATCH 23/23] bump aimdo --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6754c94c4226..193d60cf04d9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0 filelock av>=14.2.0 comfy-kitchen>=0.2.8 -comfy-aimdo==0.4.1 +comfy-aimdo==0.4.2 requests simpleeval>=1.0.0 blake3
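For reference, the RAM pressure caching thresholds introduced in patches 17 and 18 work out as sketched below. This is illustrative only: it assumes a 64 GB machine and expresses everything in GB, whereas main.py derives the values from comfy.model_management.total_ram in prompt_worker().

    total_ram_gb = 64.0  # assumed example machine
    cache_ram = min(32.0, max(4.0, total_ram_gb * 0.25))            # 16.0 GB active headroom
    cache_ram_inactive = min(96.0, max(12.0, total_ram_gb * 0.75))  # 48.0 GB inactive/pin headroom

On the command line the same thresholds can be set explicitly, e.g. --cache-ram 8 24 for 8 GB active and 24 GB inactive headroom, while --cache-ram with no values (or omitting the flag entirely, now that RAM pressure caching is the default) uses the auto values above; --cache-classic, --cache-lru and --cache-none still opt out.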