Fix the determinism issue by introducing a pool of readers/tokenizers usable by intra-shard threads.

ArrayRecord Team · copybara-github · commit 6af685383a20 · 2026-04-15T00:51:52.000-07:00
The pool keeps a fixed number of readers when underutilized, but scales up if demand for readers spikes (i.e. the pool is empty). This keeps a reasonable, predictable memory footprint while still allowing the pool to respond to large increases in demand, although newly created readers can have a non-trivial initialization time.

PiperOrigin-RevId: 897665704
diff --git a/python/BUILD b/python/BUILD
@@ -3,6 +3,7 @@
 load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
 load("@pypi//:requirements.bzl", "requirement")
 
+
 package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])
diff --git a/python/array_record_data_source.py b/python/array_record_data_source.py
@@ -38,6 +38,7 @@ def __getitem__(self, record_keys: Sequence[int]) -> Sequence[T]:
 import itertools
 import os
 import pathlib
+import queue
 import re
 import typing
 from typing import Any, Callable, Iterator, List, Mapping, Protocol, Sequence, SupportsIndex, Tuple, TypeVar, Union
@@ -221,6 +222,7 @@ def __init__(
           PathLikeOrFileInstruction, Sequence[PathLikeOrFileInstruction]
       ],
       reader_options: dict[str, str] | None = None,
+      reader_pool_size: int = 1,
   ):
     """Creates a new ArrayRecordDataSource object.
 
@@ -242,6 +244,8 @@ def __init__(
         initialization faster.
       reader_options: string of comma-separated options to be passed when
         creating a reader.
+      reader_pool_size: Number of readers to pre-allocate in the pool for each
+        shard. Default is 1.
     """
     if isinstance(paths, (str, pathlib.Path, FileInstruction)):
       paths = [paths]
@@ -270,8 +274,20 @@ def __init__(
       )
     self._read_instructions = _get_read_instructions(paths)
     self._paths = [ri.filename for ri in self._read_instructions]
-    # We open readers lazily when we need to read from them.
-    self._readers = [None] * len(self._read_instructions)
+    self._reader_pool_size = max(reader_pool_size, 1)
+    # We maintain a pool of readers for each shard to ensure thread safety
+    # while allowing concurrent reads.
+    self._reader_pools = [
+        queue.LifoQueue(maxsize=self._reader_pool_size)
+        for _ in self._read_instructions
+    ]
+    # We pre-populate the pools sequentially to avoid I/O storms at startup.
+    for i, ri in enumerate(self._read_instructions):
+      for _ in range(self._reader_pool_size):
+        reader = _create_reader(ri.filename, self._reader_options_string)
+        _check_group_size(ri.filename, reader)
+        self._reader_pools[i].put(reader)
+
     self._num_records = sum(
         map(lambda x: x.num_records, self._read_instructions)
     )
@@ -286,10 +302,11 @@ def __enter__(self):
 
   def __exit__(self, exc_type, exc_value, traceback):
     logging.debug("__exit__ for ArrayRecordDataSource is called.")
-    for reader in self._readers:
-      if reader:
-        reader.close()
-    self._readers = [None] * len(self._read_instructions)
+    for pool in self._reader_pools:
+      while not pool.empty():
+        reader = pool.get()
+        if reader:
+          reader.close()
 
   def __len__(self) -> int:
     return self._num_records
@@ -329,21 +346,33 @@ def _split_keys_per_reader(
         positions_and_indices[reader_idx] = [(position, idx)]
     return positions_and_indices
 
-  def _ensure_reader_exists(self, reader_idx: int) -> None:
-    """Threadsafe method to create corresponding reader if it doesn't exist."""
-    if self._readers[reader_idx] is not None:
-      return
-    filename = self._read_instructions[reader_idx].filename
-    reader = _create_reader(filename, self._reader_options_string)
-    _check_group_size(filename, reader)
-    self._readers[reader_idx] = reader
+  def _get_reader(self, reader_idx: int) -> Any:
+    """Gets a reader from the pool or creates a new one."""
+    try:
+      return self._reader_pools[reader_idx].get_nowait()
+    except queue.Empty:
+      filename = self._read_instructions[reader_idx].filename
+      reader = _create_reader(filename, self._reader_options_string)
+      _check_group_size(filename, reader)
+      return reader
+
+  def _release_reader(self, reader_idx: int, reader: Any) -> None:
+    """Returns a reader to the pool."""
+    try:
+      self._reader_pools[reader_idx].put_nowait(reader)
+    except queue.Full:
+      if reader:
+        reader.close()
 
   def __getitem__(self, record_key: SupportsIndex) -> bytes:
     reader_idx, position = self._reader_idx_and_position(record_key)
-    self._ensure_reader_exists(reader_idx)
-    if hasattr(self._readers[reader_idx], "read"):
-      return self._readers[reader_idx].read([position])[0]
-    return self._readers[reader_idx][position]
+    reader = self._get_reader(reader_idx)
+    try:
+      if hasattr(reader, "read"):
+        return reader.read([position])[0]
+      return reader[position]
+    finally:
+      self._release_reader(reader_idx, reader)
 
   def __getitems__(
       self, record_keys: Sequence[SupportsIndex]
@@ -352,14 +381,16 @@ def read_records(
         reader_idx: int, reader_positions_and_indices: Sequence[Tuple[int, int]]
     ) -> Sequence[Tuple[Any, int]]:
       """Reads records using the given reader keeping track of the indices."""
-      # Initialize readers lazily when we need to read from them.
-      self._ensure_reader_exists(reader_idx)
-      positions, indices = list(zip(*reader_positions_and_indices))
-      if hasattr(self._readers[reader_idx], "read"):
-        records = self._readers[reader_idx].read(positions)  # pytype: disable=attribute-error
-      else:
-        records = [self._readers[reader_idx][p] for p in positions]
-      return list(zip(records, indices))
+      reader = self._get_reader(reader_idx)
+      try:
+        positions, indices = list(zip(*reader_positions_and_indices))
+        if hasattr(reader, "read"):
+          records = reader.read(positions)  # pytype: disable=attribute-error
+        else:
+          records = [reader[p] for p in positions]
+        return list(zip(records, indices))
+      finally:
+        self._release_reader(reader_idx, reader)
 
     positions_and_indices = self._split_keys_per_reader(record_keys)
     num_threads = _get_flag_value(_GRAIN_NUM_THREADS_FETCHING_RECORDS)
@@ -390,15 +421,31 @@ def read_records(
   def __getstate__(self):
     logging.debug("__getstate__ for ArrayRecordDataSource is called.")
     state = self.__dict__.copy()
-    del state["_readers"]
+    state.pop("_reader_pools", None)
     return state
 
   def __setstate__(self, state):
     logging.debug("__setstate__ for ArrayRecordDataSource is called.")
     self.__dict__.update(state)
-    # We open readers lazily when we need to read from them. Thus, we don't
-    # need to re-open the same files as before pickling.
-    self._readers = [None] * len(self._read_instructions)
+    # After pickling, we re-initialize the reader pools.
+    self._reader_pools = [
+        queue.LifoQueue(maxsize=self._reader_pool_size)
+        for _ in self._read_instructions
+    ]
+    # We pre-populate the pools again in the new process.
+    for i, ri in enumerate(self._read_instructions):
+      for _ in range(self._reader_pool_size):
+        reader = _create_reader(ri.filename, self._reader_options_string)
+        _check_group_size(ri.filename, reader)
+        self._reader_pools[i].put(reader)
+
+  def _peek_readers(self) -> List[Any]:
+    """Returns a list of readers (one per shard) or None (for testing)."""
+    readers = []
+    for pool in self._reader_pools:
+      with pool.mutex:
+        readers.append(pool.queue[-1] if pool.queue else None)
+    return readers
 
   def __repr__(self) -> str:
     """Storing a hash of paths since paths can be a very long list."""
diff --git a/python/array_record_data_source_test.py b/python/array_record_data_source_test.py
@@ -109,7 +109,7 @@ def test_array_record_data_source_single_path(self):
     ) as ar:
       actual_data = [ar[x] for x in indices_to_read]
     self.assertEqual(expected_data, actual_data)
-    self.assertTrue(all(reader is None for reader in ar._readers))
+    self.assertTrue(all(reader is None for reader in ar._peek_readers()))
 
   def test_array_record_data_source_string_read_instructions(self):
     indices_to_read = [0, 1, 2, 3, 4]
@@ -132,7 +132,7 @@ def test_array_record_data_source_reverse_order(self):
     ]) as ar:
       actual_data = [ar[x] for x in indices_to_read]
     self.assertEqual(expected_data, actual_data)
-    self.assertTrue(all(reader is None for reader in ar._readers))
+    self.assertTrue(all(reader is None for reader in ar._peek_readers()))
 
   def test_array_record_data_source_random_order(self):
     # some random permutation
@@ -144,7 +144,7 @@ def test_array_record_data_source_random_order(self):
     ]) as ar:
       actual_data = [ar[x] for x in indices_to_read]
     self.assertEqual(expected_data, actual_data)
-    self.assertTrue(all(reader is None for reader in ar._readers))
+    self.assertTrue(all(reader is None for reader in ar._peek_readers()))
 
   def test_array_record_data_source_random_order_batched(self):
     # some random permutation
@@ -156,7 +156,7 @@ def test_array_record_data_source_random_order_batched(self):
     ]) as ar:
       actual_data = ar.__getitems__(indices_to_read)
     self.assertEqual(expected_data, actual_data)
-    self.assertTrue(all(reader is None for reader in ar._readers))
+    self.assertTrue(all(reader is None for reader in ar._peek_readers()))
 
   def test_array_record_data_source_file_instructions(self):
     file_instruction_one = DummyFileInstruction(
@@ -187,7 +187,7 @@ def test_array_record_data_source_file_instructions(self):
       actual_data = [ar[x] for x in indices_to_read]
 
     self.assertEqual(expected_data, actual_data)
-    self.assertTrue(all(reader is None for reader in ar._readers))
+    self.assertTrue(all(reader is None for reader in ar._peek_readers()))
 
   def test_array_record_source_reader_idx_and_position(self):
     file_instructions = [