Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 38 additions & 9 deletions kv_cache_benchmark/kv_cache/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,22 @@ class NVMeBackend(StorageBackend):
This is the third and slowest tier, used for offloading from CPU RAM.
"""

def __init__(self, base_path: str = None):
def __init__(self, base_path: str = None, use_mmap: bool = False):
"""
Initialize NVMe backend.

Args:
base_path: Directory for cache files. If None, uses temp directory.
use_mmap: If True, use memory-mapped loading (np.load(mmap_mode='r') + np.array()).
If False (default), use direct loading (np.load() only).

Note:
- use_mmap=False: Faster for multi-threaded workloads (avoids memory allocation contention)
- use_mmap=True: May be faster for single-threaded or when memory is constrained
"""
self.temp_dir = None
self.use_mmap = use_mmap

if base_path is None:
self.temp_dir = tempfile.TemporaryDirectory(prefix="kv_cache_")
self.base_path = Path(self.temp_dir.name)
Expand Down Expand Up @@ -304,14 +318,29 @@ def read(self, key: str) -> Tuple[np.ndarray, StorageBackend.IOTiming]:
pass

pre_load = time.perf_counter()
data = np.load(path, allow_pickle=False)
load_done = time.perf_counter()
data = np.array(data)
copy_done = time.perf_counter()

device_time = load_done - pre_load
host_time = (pre_load - start) + (copy_done - load_done)
total = copy_done - start

if self.use_mmap:
# Memory-mapped mode: Load as mmap, then copy to array
# This can be faster in single-threaded scenarios or when memory is constrained
data = np.load(path, mmap_mode='r', allow_pickle=False)
mmap_done = time.perf_counter()
data = np.array(data) # Copy from mmap to writable array
copy_done = time.perf_counter()

# Device time = mmap creation + copy (actual data movement)
device_time = (mmap_done - pre_load) + (copy_done - mmap_done)
host_time = pre_load - start # Just cache drop overhead
total = copy_done - start
else:
# Direct mode (default): Load directly into memory
# This is faster for multi-threaded workloads (avoids memory allocation contention)
data = np.load(path, mmap_mode=None, allow_pickle=False)
load_done = time.perf_counter()

device_time = load_done - pre_load # Disk I/O + deserialization time
host_time = pre_load - start # Just cache drop overhead
total = load_done - start

return data, StorageBackend.IOTiming(total=total, device=device_time, host=host_time)

def delete(self, key: str):
Expand Down
2 changes: 2 additions & 0 deletions kv_cache_benchmark/kv_cache/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(self,
cpu_memory_gb: float,
duration_seconds: int,
cache_dir: str = None,
use_mmap: bool = False,
enable_autoscaling: bool = False,
autoscaler_mode: str = 'qos',
target_saturation: float = 0.8,
Expand Down Expand Up @@ -125,6 +126,7 @@ def __init__(self,
gpu_memory_gb=gpu_memory_gb,
cpu_memory_gb=cpu_memory_gb,
cache_dir=cache_dir,
use_mmap=use_mmap,
performance_profile=performance_profile,
seed=seed,
max_concurrent_allocs=max_concurrent_allocs,
Expand Down
3 changes: 2 additions & 1 deletion kv_cache_benchmark/kv_cache/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ def __init__(self,
gpu_memory_gb: float,
cpu_memory_gb: float,
cache_dir: str = None,
use_mmap: bool = False,
eviction_policy: str = 'lru',
performance_profile: str = 'latency',
seed: Optional[int] = None,
Expand All @@ -237,7 +238,7 @@ def __init__(self,
logger.warning(f"Could not initialize GPU backend: {e}")

self.backends['cpu'] = CPUMemoryBackend()
self.backends['nvme'] = NVMeBackend(base_path=cache_dir)
self.backends['nvme'] = NVMeBackend(base_path=cache_dir, use_mmap=use_mmap)

self.generator = KVCacheGenerator(model_config, global_seed=self.seed)

Expand Down
4 changes: 4 additions & 0 deletions kv_cache_benchmark/kv_cache/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,9 @@ def main():
help='The amount of CPU memory (RAM) to allocate for the cache in GB.')
parser.add_argument('--cache-dir', type=str, default=None,
help='The directory to use for the NVMe cache tier.')
parser.add_argument('--use-mmap', action='store_true',
help='Use memory-mapped loading (np.load(mmap_mode="r") + np.array()). '
'Default is False (direct loading with np.load() only), which is faster for multi-threaded workloads.')
parser.add_argument('--generation-mode', type=str, default='realistic', choices=[g.value for g in GenerationMode],
help='The token generation speed simulation mode.')
parser.add_argument('--performance-profile', type=str, default='latency', choices=['latency', 'throughput'],
Expand Down Expand Up @@ -352,6 +355,7 @@ def main():
cpu_memory_gb=args.cpu_mem_gb,
duration_seconds=args.duration,
cache_dir=args.cache_dir,
use_mmap=args.use_mmap,
enable_autoscaling=args.enable_autoscaling,
autoscaler_mode=args.autoscaler_mode,
target_saturation=args.target_saturation,
Expand Down
Loading