diff --git a/kv_cache_benchmark/kv_cache/backends.py b/kv_cache_benchmark/kv_cache/backends.py
index cd133e5..8dacac2 100755
--- a/kv_cache_benchmark/kv_cache/backends.py
+++ b/kv_cache_benchmark/kv_cache/backends.py
@@ -238,8 +238,22 @@ class NVMeBackend(StorageBackend):
     This is the third and slowest tier, used for offloading from CPU RAM.
     """
 
-    def __init__(self, base_path: str = None):
+    def __init__(self, base_path: str = None, use_mmap: bool = False):
+        """
+        Initialize NVMe backend.
+
+        Args:
+            base_path: Directory for cache files. If None, uses temp directory.
+            use_mmap: If True, use memory-mapped loading (np.load(mmap_mode='r') + np.array()).
+                If False (default), use direct loading (np.load() only).
+
+        Note:
+            - use_mmap=False: Faster for multi-threaded workloads (avoids memory allocation contention)
+            - use_mmap=True: May be faster for single-threaded or when memory is constrained
+        """
         self.temp_dir = None
+        self.use_mmap = use_mmap
+
         if base_path is None:
             self.temp_dir = tempfile.TemporaryDirectory(prefix="kv_cache_")
             self.base_path = Path(self.temp_dir.name)
@@ -304,14 +318,29 @@ def read(self, key: str) -> Tuple[np.ndarray, StorageBackend.IOTiming]:
             pass
 
         pre_load = time.perf_counter()
-        data = np.load(path, allow_pickle=False)
-        load_done = time.perf_counter()
-        data = np.array(data)
-        copy_done = time.perf_counter()
-
-        device_time = load_done - pre_load
-        host_time = (pre_load - start) + (copy_done - load_done)
-        total = copy_done - start
+
+        if self.use_mmap:
+            # Memory-mapped mode: Load as mmap, then copy to array
+            # This can be faster in single-threaded scenarios or when memory is constrained
+            data = np.load(path, mmap_mode='r', allow_pickle=False)
+            mmap_done = time.perf_counter()
+            data = np.array(data)  # Copy from mmap to writable array
+            copy_done = time.perf_counter()
+
+            # Device time = mmap creation + copy (actual data movement)
+            device_time = (mmap_done - pre_load) + (copy_done - mmap_done)
+            host_time = pre_load - start  # Just cache drop overhead
+            total = copy_done - start
+        else:
+            # Direct mode (default): Load directly into memory
+            # This is faster for multi-threaded workloads (avoids memory allocation contention)
+            data = np.load(path, mmap_mode=None, allow_pickle=False)
+            load_done = time.perf_counter()
+
+            device_time = load_done - pre_load  # Disk I/O + deserialization time
+            host_time = pre_load - start  # Just cache drop overhead
+            total = load_done - start
+
         return data, StorageBackend.IOTiming(total=total, device=device_time, host=host_time)
 
     def delete(self, key):
diff --git a/kv_cache_benchmark/kv_cache/benchmark.py b/kv_cache_benchmark/kv_cache/benchmark.py
index f345891..3203548 100755
--- a/kv_cache_benchmark/kv_cache/benchmark.py
+++ b/kv_cache_benchmark/kv_cache/benchmark.py
@@ -48,6 +48,7 @@ def __init__(self,
                  cpu_memory_gb: float,
                  duration_seconds: int,
                  cache_dir: str = None,
+                 use_mmap: bool = False,
                  enable_autoscaling: bool = False,
                  autoscaler_mode: str = 'qos',
                  target_saturation: float = 0.8,
@@ -125,6 +126,7 @@ def __init__(self,
             gpu_memory_gb=gpu_memory_gb,
             cpu_memory_gb=cpu_memory_gb,
             cache_dir=cache_dir,
+            use_mmap=use_mmap,
             performance_profile=performance_profile,
             seed=seed,
             max_concurrent_allocs=max_concurrent_allocs,
diff --git a/kv_cache_benchmark/kv_cache/cache.py b/kv_cache_benchmark/kv_cache/cache.py
index e1d904a..c9566d8 100755
--- a/kv_cache_benchmark/kv_cache/cache.py
+++ b/kv_cache_benchmark/kv_cache/cache.py
@@ -211,6 +211,7 @@ def __init__(self,
                  gpu_memory_gb: float,
                  cpu_memory_gb: float,
                  cache_dir: str = None,
+                 use_mmap: bool = False,
                  eviction_policy: str = 'lru',
                  performance_profile: str = 'latency',
                  seed: Optional[int] = None,
@@ -237,7 +238,7 @@ def __init__(self,
                 logger.warning(f"Could not initialize GPU backend: {e}")
 
         self.backends['cpu'] = CPUMemoryBackend()
-        self.backends['nvme'] = NVMeBackend(base_path=cache_dir)
+        self.backends['nvme'] = NVMeBackend(base_path=cache_dir, use_mmap=use_mmap)
 
         self.generator = KVCacheGenerator(model_config,
                                           global_seed=self.seed)
diff --git a/kv_cache_benchmark/kv_cache/cli.py b/kv_cache_benchmark/kv_cache/cli.py
index 03864c3..a80aae6 100755
--- a/kv_cache_benchmark/kv_cache/cli.py
+++ b/kv_cache_benchmark/kv_cache/cli.py
@@ -244,6 +244,9 @@ def main():
                         help='The amount of CPU memory (RAM) to allocate for the cache in GB.')
     parser.add_argument('--cache-dir', type=str, default=None,
                         help='The directory to use for the NVMe cache tier.')
+    parser.add_argument('--use-mmap', action='store_true',
+                        help='Use memory-mapped loading (np.load(mmap_mode="r") + np.array()). '
+                             'Default is False (direct loading with np.load() only), which is faster for multi-threaded workloads.')
     parser.add_argument('--generation-mode', type=str, default='realistic', choices=[g.value for g in GenerationMode],
                         help='The token generation speed simulation mode.')
     parser.add_argument('--performance-profile', type=str, default='latency', choices=['latency', 'throughput'],
@@ -352,6 +355,7 @@ def main():
            cpu_memory_gb=args.cpu_mem_gb,
            duration_seconds=args.duration,
            cache_dir=args.cache_dir,
+           use_mmap=args.use_mmap,
            enable_autoscaling=args.enable_autoscaling,
            autoscaler_mode=args.autoscaler_mode,
            target_saturation=args.target_saturation,