Draft
24 commits
1fe3a13
model_management: disable non-dynamic smart memory
rattus128 May 7, 2026
157965a
pinned_memory: implement with aimdo growable buffer
rattus128 May 4, 2026
b66b642
mm: use aimdo to do transfer from disk to pin
rattus128 May 4, 2026
8070cb7
Add stream host pin buffer for AIMDO casts
rattus128 May 7, 2026
1795523
remove old pin path
rattus128 May 7, 2026
8187cd7
Implement JIT pinned memory pressure
rattus128 May 7, 2026
2b927e1
LowVRAMPatch: change to two-phase visit
rattus128 May 7, 2026
8e473d7
lora: re-implement as inplace swiss-army-knife operation
rattus128 May 7, 2026
e48dace
prepare for multiple pin sets
rattus128 May 7, 2026
3a3b75a
implement pinned loras
rattus128 May 8, 2026
c395f2d
requirements: comfy-aimdo 0.4.0
rattus128 May 8, 2026
44c0a06
ops: remove unused arg
rattus128 May 11, 2026
ee927aa
ops: sync the CPU with only the offload stream activity
rattus128 May 9, 2026
d61026d
pins: implement freeing intermediate for pinned memory
rattus128 May 12, 2026
3f71781
execution: implement pin eviction on RAM pressure
rattus128 May 13, 2026
3115053
implement pin registration swaps
rattus128 May 14, 2026
18a74cb
cli_args/execution: Implement lower background cache-ram threshold
rattus128 May 13, 2026
d8b4427
make default
rattus128 May 13, 2026
55197d8
bump aimdo
rattus128 May 15, 2026
ea5775c
Merge pull request #9 from rattus128/dev/threaded-loader-2-ram-cache
rattus128 May 15, 2026
0242954
model-patcher: force-cast tiny weights
rattus128 May 15, 2026
ed15d62
ops: refactor in prep for chunking
rattus128 May 15, 2026
4386563
mm: delegate pin-on-the-way to aimdo
rattus128 May 15, 2026
52a68b9
bump aimdo
rattus128 May 15, 2026
7 changes: 4 additions & 3 deletions comfy/cli_args.py
@@ -110,13 +110,11 @@ def from_string(cls, value: str):

parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")

CACHE_RAM_AUTO_GB = -1.0

cache_group = parser.add_mutually_exclusive_group()
cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).")
cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).")

attn_group = parser.add_mutually_exclusive_group()
attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@@ -246,6 +244,9 @@ def is_valid_directory(path: str) -> str:
else:
args = parser.parse_args([])

if args.cache_ram is not None and len(args.cache_ram) > 2:
parser.error("--cache-ram accepts at most two values: active GB and inactive GB")

if args.windows_standalone_build:
args.auto_launch = True

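For reference, a minimal sketch of how the two --cache-ram values could be resolved against the defaults described in the help text. The helper name, the use of psutil, and the clamping code are illustrative assumptions, not code from this PR:

import psutil

GB = 1024 ** 3

def resolve_cache_ram_thresholds(values):
    # Hypothetical helper: turn the parsed --cache-ram list (0, 1 or 2 floats,
    # in GB) into (active, inactive) byte thresholds.
    total = psutil.virtual_memory().total
    active = min(max(0.25 * total, 4 * GB), 32 * GB)     # default: 25% of RAM, clamped to 4-32GB
    inactive = min(max(0.75 * total, 12 * GB), 96 * GB)  # default: 75% of RAM, clamped to 12-96GB
    if len(values) >= 1:
        active = values[0] * GB
    if len(values) >= 2:
        inactive = values[1] * GB
    return int(active), int(inactive)

# --cache-ram        -> both defaults
# --cache-ram 8      -> active 8GB, inactive defaulted
# --cache-ram 8 24   -> active 8GB, inactive 24GB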
19 changes: 13 additions & 6 deletions comfy/lora.py
@@ -475,16 +475,23 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori

return weight

def prefetch_prepared_value(value, allocate_buffer, stream):
def prefetch_prepared_value(value, counter, destination, stream, copy):
if isinstance(value, torch.Tensor):
dest = allocate_buffer(comfy.memory_management.vram_aligned_size(value))
comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream)
size = comfy.memory_management.vram_aligned_size(value)
offset = counter[0]
counter[0] += size
if destination is None:
return value

dest = destination[offset:offset + size]
if copy:
comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream)
return comfy.memory_management.interpret_gathered_like([value], dest)[0]
elif isinstance(value, weight_adapter.WeightAdapterBase):
return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, allocate_buffer, stream))
return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream, copy))
elif isinstance(value, tuple):
return tuple(prefetch_prepared_value(item, allocate_buffer, stream) for item in value)
return tuple(prefetch_prepared_value(item, counter, destination, stream, copy) for item in value)
elif isinstance(value, list):
return [prefetch_prepared_value(item, allocate_buffer, stream) for item in value]
return [prefetch_prepared_value(item, counter, destination, stream, copy) for item in value]

return value
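The widened signature turns prefetch_prepared_value into a two-pass walk: a sizing pass with destination=None that only advances counter[0], then a placement pass over a single gathered buffer. A minimal sketch of such a caller, assuming the caller supplies its own allocator and offload stream (this caller is illustrative, not the PR's actual call site):

from comfy.lora import prefetch_prepared_value

def prefetch_values(values, allocate_buffer, stream):
    # Pass 1: accumulate the total vram-aligned size of every tensor reachable
    # from the prepared values; no data is moved.
    counter = [0]
    for v in values:
        prefetch_prepared_value(v, counter, None, stream, False)

    # One gathered allocation for everything, then pass 2 slices it up,
    # copies each tensor in (copy=True) and returns views into the buffer.
    destination = allocate_buffer(counter[0])
    counter = [0]
    return [prefetch_prepared_value(v, counter, destination, stream, True)
            for v in values]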
23 changes: 19 additions & 4 deletions comfy/memory_management.py
@@ -15,20 +15,25 @@ class TensorFileSlice(NamedTuple):
size: int


def read_tensor_file_slice_into(tensor, destination):
def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=None):

if isinstance(tensor, QuantizedTensor):
if not isinstance(destination, QuantizedTensor):
return False
if tensor._layout_cls != destination._layout_cls:
return False

if not read_tensor_file_slice_into(tensor._qdata, destination._qdata):
if not read_tensor_file_slice_into(tensor._qdata, destination._qdata, stream=stream,
destination2=(destination2._qdata if destination2 is not None else None)):
return False

dst_orig_dtype = destination._params.orig_dtype
destination._params.copy_from(tensor._params, non_blocking=False)
destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype)
if destination2 is not None:
dst_orig_dtype = destination2._params.orig_dtype
destination2._params.copy_from(destination._params, non_blocking=True)
destination2._params = dataclasses.replace(destination2._params, orig_dtype=dst_orig_dtype)
return True

info = getattr(tensor.untyped_storage(), "_comfy_tensor_file_slice", None)
@@ -48,6 +53,16 @@ def read_tensor_file_slice_into(tensor, destination):
if info.size == 0:
return True

hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None)
if hostbuf is not None:
stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0
device_ptr = destination2.data_ptr() if destination2 is not None else 0
hostbuf.read_file_slice(file_obj, info.offset, info.size,
offset=destination.data_ptr() - hostbuf.get_raw_address(),
stream=stream_ptr,
device_ptr=device_ptr)
return True

buf_type = ctypes.c_ubyte * info.size
view = memoryview(buf_type.from_address(destination.data_ptr()))
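A minimal sketch of how the new stream/destination2 arguments might be used, assuming destination is a pinned host tensor whose storage carries the _comfy_hostbuf handle and destination2 is the matching device tensor (the caller below and its names are illustrative, not this PR's actual call site):

from comfy.memory_management import read_tensor_file_slice_into

def load_from_disk_via_pin(file_tensor, pinned_dest, device_dest, offload_stream):
    # Reads the file slice into the pinned host tensor; because the device
    # pointer is passed through, the host buffer also pushes the bytes to
    # device_dest asynchronously on offload_stream.
    ok = read_tensor_file_slice_into(file_tensor, pinned_dest,
                                     stream=offload_stream,
                                     destination2=device_dest)
    # False means an incompatible layout (e.g. quantized layout mismatch);
    # the caller should fall back to its ordinary blocking load path.
    return ok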

@@ -151,7 +166,7 @@ def set_ram_cache_release_state(callback, headroom):
extra_ram_release_callback = callback
RAM_CACHE_HEADROOM = max(0, int(headroom))

def extra_ram_release(target):
def extra_ram_release(target, free_active=False):
if extra_ram_release_callback is None:
return 0
return extra_ram_release_callback(target)
return extra_ram_release_callback(target, free_active=free_active)
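A sketch of how a registered release callback could honor the new free_active flag. The toy cache dictionaries below are stand-ins for the executor's real cache structures and are not part of this PR:

from comfy.memory_management import set_ram_cache_release_state

inactive_cache, active_cache = {}, {}   # toy stand-ins holding name -> tensor

def release_cache_ram(target, free_active=False):
    # Drop inactive entries first; only touch results the graph may still
    # need when free_active is set (i.e. under hard RAM pressure).
    caches = [inactive_cache, active_cache] if free_active else [inactive_cache]
    freed = 0
    for cache in caches:
        while cache and freed < target:
            _, tensor = cache.popitem()
            freed += tensor.numel() * tensor.element_size()
    return freed

set_ram_cache_release_state(release_cache_ram, headroom=4 * 1024 ** 3)
# extra_ram_release(target, free_active=True) now forwards the flag to the callback.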