scverse · ilan-gold · Jan 16, 2026 · Jan 13, 2026 · Jan 13, 2026 · Jan 13, 2026
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -3,7 +3,7 @@ version: 2
 build:
   os: ubuntu-24.04
   tools:
-    python: "3.12"
+    python: "3.14"
   jobs:
     create_environment:
       - asdf plugin add uv

diff --git a/README.md b/README.md
@@ -71,7 +71,7 @@ For a detailed tutorial, please see the [in-depth section of our docs][]
 Basic preprocessing:
 
 ```python
-from annbatch import create_anndata_collection
+from annbatch import DatasetCollection
 
 import zarr
 from pathlib import Path
@@ -82,13 +82,14 @@ zarr.config.set(
     {"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"}
 )
 
-create_anndata_collection(
+# Create a collection at the given path. The subgroups will all be anndata stores.
+collection = DatasetCollection("path/to/output/collection.zarr")
+collection.add_adatas(
     adata_paths=[
         "path/to/your/file1.h5ad",
         "path/to/your/file2.h5ad"
     ],
-    output_path="path/to/output/collection",  # a directory containing `dataset_{i}.zarr`
-    shuffle=True,  # shuffling is needed if you want to use chunked access
+    shuffle=True,  # shuffling is required for chunked access; `True` is already the default
 )
 ```
 
@@ -107,22 +108,20 @@ zarr.config.set(
     {"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"}
 )
 
+def custom_load_func(g: zarr.Group) -> ad.AnnData:
+    return ad.AnnData(X=ad.io.sparse_dataset(g["layers"]["counts"]), obs=ad.io.read_elem(g["obs"])[some_subset_of_columns])
+
 # This settings override ensures that you don't lose/alter your categorical codes when reading the data in!
 with ad.settings.override(remove_unused_categories=False):
     ds = Loader(
         batch_size=4096,
         chunk_size=32,
         preload_nchunks=256,
-    ).add_anndatas(
-        [
-            ad.AnnData(
-                # note that you can open an AnnData file using any type of zarr store
-                X=ad.io.sparse_dataset(zarr.open(p)["X"]),
-                obs=ad.io.read_elem(zarr.open(p)["obs"]),
-            )
-            for p in Path("path/to/output/collection").glob("*.zarr")
-        ]
     )
+    # `use_collection` automatically uses the on-disk `X` and full `obs` in the `Loader`
+    # but the `load_adata` arg can override this behavior
+    # (see `custom_load_func` above for an example of customization).
+    ds = ds.use_collection(collection)
 
 # Iterate over dataloader (plugin replacement for torch.utils.DataLoader)
 for batch in ds:

diff --git a/docs/api.md b/docs/api.md
@@ -25,8 +25,7 @@
    :toctree: generated/
 
     write_sharded
-    add_to_collection
-    create_anndata_collection
+    DatasetCollection
 ```
 
 (types)=

diff --git a/docs/index.md b/docs/index.md
@@ -9,12 +9,11 @@ Let's go through the above example:
 ### Preprocessing
 
 ```python
-create_anndata_collection(
+collection = DatasetCollection("path/to/output/store.zarr").add_adatas(
     adata_paths=[
         "path/to/your/file1.h5ad",
         "path/to/your/file2.h5ad"
     ],
-    output_path="path/to/output/store",  # a directory containing `chunk_{i}.zarr`
     shuffle=True,  # shuffling is needed if you want to use chunked access
 )
 ```
@@ -33,20 +32,12 @@ See the [zarr docs on sharding][] for more information.
 #### Chunked access
 
 ```python
+# `use_collection` will automatically get everything in `X` and `obs` and yield it.
 ds = Loader(
     batch_size=4096,
     chunk_size=32,
     preload_nchunks=256,
-).add_anndatas(
-    [
-        ad.AnnData(
-            # note that you can open an anndata file using any type of zarr store
-            X=ad.io.sparse_dataset(zarr.open(p)["X"]),
-            obs=ad.io.read_elem(zarr.open(p)["obs"]),
-        )
-        for p in PATH_TO_STORE.glob("*.zarr")
-    ]
-)
+).use_collection(collection)
 
 # Iterate over dataloader (plugin replacement for torch.utils.DataLoader)
 for batch in ds:

diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,7 +49,8 @@ optional-dependencies.doc = [
   "myst-nb>=1.1",
   "pandas",
   "scanpydoc[theme,typehints]>=0.15.3",
-  "sphinx>=8.1",
+  # https://github.com/sphinx-toolbox/sphinx-toolbox/issues/201
+  "sphinx>=8.1,<=8.2.3",
   "sphinx-autodoc-typehints",
   "sphinx-book-theme>=1",
   "sphinx-copybutton",

diff --git a/src/annbatch/__init__.py b/src/annbatch/__init__.py
@@ -3,9 +3,9 @@
 from importlib.metadata import version
 
 from . import types
-from .io import add_to_collection, create_anndata_collection, write_sharded
+from .io import DatasetCollection, write_sharded
 from .loader import Loader
 
 __version__ = version("annbatch")
 
-__all__ = ["Loader", "write_sharded", "add_to_collection", "create_anndata_collection", "types"]
+__all__ = ["Loader", "write_sharded", "DatasetCollection", "types"]