mlcommons · LouisDDN · Mar 23, 2026
@@ -238,8 +238,22 @@ class NVMeBackend(StorageBackend):
     This is the third and slowest tier, used for offloading from CPU RAM.
     """
 
-    def __init__(self, base_path: str = None):
+    def __init__(self, base_path: str = None, use_mmap: bool = False):
+        """
+        Initialize NVMe backend.
+
+        Args:
+            base_path: Directory for cache files. If None, uses temp directory.
+            use_mmap: If True, use memory-mapped loading (np.load(mmap_mode='r') + np.array()).
+                     If False (default), use direct loading (np.load() only).
+
+        Note:
+            - use_mmap=False: Faster for multi-threaded workloads (avoids memory allocation contention)
+            - use_mmap=True: May be faster for single-threaded workloads or when memory is constrained
+        """
         self.temp_dir = None
+        self.use_mmap = use_mmap
+
         if base_path is None:
             self.temp_dir = tempfile.TemporaryDirectory(prefix="kv_cache_")
             self.base_path = Path(self.temp_dir.name)
@@ -304,14 +318,29 @@ def read(self, key: str) -> Tuple[np.ndarray, StorageBackend.IOTiming]:
             pass
 
         pre_load = time.perf_counter()
-        data = np.load(path, allow_pickle=False)
-        load_done = time.perf_counter()
-        data = np.array(data)
-        copy_done = time.perf_counter()
-
-        device_time = load_done - pre_load
-        host_time = (pre_load - start) + (copy_done - load_done)
-        total = copy_done - start
+
+        if self.use_mmap:
+            # Memory-mapped mode: Load as mmap, then copy to array
+            # This can be faster in single-threaded scenarios or when memory is constrained
+            data = np.load(path, mmap_mode='r', allow_pickle=False)
+            mmap_done = time.perf_counter()
+            data = np.array(data)  # Copy from mmap to writable array
+            copy_done = time.perf_counter()
+
+            # Device time = mmap creation + copy (actual data movement)
+            device_time = (mmap_done - pre_load) + (copy_done - mmap_done)
+            host_time = pre_load - start  # Just cache drop overhead
+            total = copy_done - start
+        else:
+            # Direct mode (default): Load directly into memory
+            # This is faster for multi-threaded workloads (avoids memory allocation contention)
+            data = np.load(path, mmap_mode=None, allow_pickle=False)
+            load_done = time.perf_counter()
+
+            device_time = load_done - pre_load  # Disk I/O + deserialization time
+            host_time = pre_load - start  # Just cache drop overhead
+            total = load_done - start
+
         return data, StorageBackend.IOTiming(total=total, device=device_time, host=host_time)
 
     def delete(self, key: str):

@@ -48,6 +48,7 @@ def __init__(self,
                  cpu_memory_gb: float,
                  duration_seconds: int,
                  cache_dir: str = None,
+                 use_mmap: bool = False,
                  enable_autoscaling: bool = False,
                  autoscaler_mode: str = 'qos',
                  target_saturation: float = 0.8,
@@ -125,6 +126,7 @@ def __init__(self,
             gpu_memory_gb=gpu_memory_gb,
             cpu_memory_gb=cpu_memory_gb,
             cache_dir=cache_dir,
+            use_mmap=use_mmap,
             performance_profile=performance_profile,
             seed=seed,
             max_concurrent_allocs=max_concurrent_allocs,

@@ -211,6 +211,7 @@ def __init__(self,
                  gpu_memory_gb: float,
                  cpu_memory_gb: float,
                  cache_dir: str = None,
+                 use_mmap: bool = False,
                  eviction_policy: str = 'lru',
                  performance_profile: str = 'latency',
                  seed: Optional[int] = None,
@@ -237,7 +238,7 @@ def __init__(self,
             logger.warning(f"Could not initialize GPU backend: {e}")
 
         self.backends['cpu'] = CPUMemoryBackend()
-        self.backends['nvme'] = NVMeBackend(base_path=cache_dir)
+        self.backends['nvme'] = NVMeBackend(base_path=cache_dir, use_mmap=use_mmap)
 
         self.generator = KVCacheGenerator(model_config, global_seed=self.seed)
 

@@ -244,6 +244,9 @@ def main():
                         help='The amount of CPU memory (RAM) to allocate for the cache in GB.')
     parser.add_argument('--cache-dir', type=str, default=None,
                         help='The directory to use for the NVMe cache tier.')
+    parser.add_argument('--use-mmap', action='store_true',
+                        help='Use memory-mapped loading (np.load(mmap_mode="r") + np.array()). '
+                             'Default is False (direct loading with np.load() only), which is faster for multi-threaded workloads.')
     parser.add_argument('--generation-mode', type=str, default='realistic', choices=[g.value for g in GenerationMode],
                         help='The token generation speed simulation mode.')
     parser.add_argument('--performance-profile', type=str, default='latency', choices=['latency', 'throughput'],
@@ -352,6 +355,7 @@ def main():
         cpu_memory_gb=args.cpu_mem_gb,
         duration_seconds=args.duration,
         cache_dir=args.cache_dir,
+        use_mmap=args.use_mmap,
         enable_autoscaling=args.enable_autoscaling,
         autoscaler_mode=args.autoscaler_mode,
         target_saturation=args.target_saturation,