Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
2809044
feat: move to a class based API
ilan-gold Jan 13, 2026
94ed2ae
fix: update docs
ilan-gold Jan 13, 2026
5d8c640
fix: readme
ilan-gold Jan 13, 2026
2aa6b75
fix: docs
ilan-gold Jan 13, 2026
11c7f41
fix: docs env
ilan-gold Jan 13, 2026
03dd3dd
feat: add explicit shuffle size
ilan-gold Jan 14, 2026
df6365d
chore: `PreShuffledCollection` -> `Collection`
ilan-gold Jan 14, 2026
3e7fc19
chore: move to one `add` function
ilan-gold Jan 14, 2026
61a692f
feat: add `Loader` API
ilan-gold Jan 14, 2026
a8b3cb9
fix: `Collection` is part of the `collections` built-in lib
ilan-gold Jan 14, 2026
f6141c8
fix: try getting directly from colletion
ilan-gold Jan 14, 2026
e02a424
chore: use collection iteration
ilan-gold Jan 14, 2026
34ba111
fix: forward ref
ilan-gold Jan 14, 2026
80ce3a1
fix: notebook
ilan-gold Jan 14, 2026
a15e799
fix: dont warn for keys that are ignored by custom loading functions
ilan-gold Jan 14, 2026
6eee41e
fix: remove now-useless test
ilan-gold Jan 14, 2026
8873390
fix: handle dataframes in obs + varm
ilan-gold Jan 15, 2026
471690c
fix: try bumping python
ilan-gold Jan 15, 2026
578a811
fix: bound sphinx
ilan-gold Jan 15, 2026
5c6b670
fix: docs
ilan-gold Jan 15, 2026
b70f8d6
fix: on-disk encoding
ilan-gold Jan 15, 2026
5288056
fix: more docs fixes
ilan-gold Jan 15, 2026
82a4330
fix: `is_empty`
ilan-gold Jan 15, 2026
00c19e0
fix: no doc string for `obs_names_make_unique`
ilan-gold Jan 15, 2026
21fac27
fix: intersphinx
ilan-gold Jan 15, 2026
7b882b5
fix: more docs
ilan-gold Jan 15, 2026
ee912fd
fix: api changes according to felix
ilan-gold Jan 15, 2026
23aeaf9
fix: add test
ilan-gold Jan 15, 2026
0302416
fix: `_create_chunks_for_shuffling` bug
ilan-gold Jan 15, 2026
b25657e
fix: actually shuffle chunks
ilan-gold Jan 15, 2026
e03c708
fix: key sorting
ilan-gold Jan 15, 2026
19d165b
fix: remove parameters in default test
ilan-gold Jan 15, 2026
e753c9e
fix: `<=` for checking
ilan-gold Jan 15, 2026
e29e2b3
fix: remove printing
ilan-gold Jan 15, 2026
1759c9a
fix: ensure datasets are properly exhausted when adding
ilan-gold Jan 15, 2026
29d71e9
fix: add error
ilan-gold Jan 15, 2026
5641610
fix: use pandas index for categoricals
ilan-gold Jan 15, 2026
535a218
fix: doc fixes +`slice_size` -> `chunk_size`
ilan-gold Jan 16, 2026
b8b9601
chore: stronger shuffle bounds
ilan-gold Jan 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ version: 2
build:
os: ubuntu-24.04
tools:
python: "3.12"
python: "3.14"
jobs:
create_environment:
- asdf plugin add uv
Expand Down
25 changes: 12 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ For a detailed tutorial, please see the [in-depth section of our docs][]
Basic preprocessing:

```python
from annbatch import create_anndata_collection
from annbatch import DatasetCollection

import zarr
from pathlib import Path
Expand All @@ -82,13 +82,14 @@ zarr.config.set(
{"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"}
)

create_anndata_collection(
# Create a collection at the given path. The subgroups will all be anndata stores.
collection = DatasetCollection("path/to/output/collection.zarr")
collection.add_adatas(
adata_paths=[
"path/to/your/file1.h5ad",
"path/to/your/file2.h5ad"
],
output_path="path/to/output/collection", # a directory containing `dataset_{i}.zarr`
shuffle=True, # shuffling is needed if you want to use chunked access
shuffle=True, # shuffling is needed if you want to use chunked access; it is enabled by default
)
```

Expand All @@ -107,22 +108,20 @@ zarr.config.set(
{"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"}
)

def custom_load_func(g: zarr.Group) -> ad.AnnData:
return ad.AnnData(X=ad.io.sparse_dataset(g["layers"]["counts"]), obs=ad.io.read_elem(g["obs"])[some_subset_of_columns])

# This settings override ensures that you don't lose/alter your categorical codes when reading the data in!
with ad.settings.override(remove_unused_categories=False):
ds = Loader(
batch_size=4096,
chunk_size=32,
preload_nchunks=256,
).add_anndatas(
[
ad.AnnData(
# note that you can open an AnnData file using any type of zarr store
X=ad.io.sparse_dataset(zarr.open(p)["X"]),
obs=ad.io.read_elem(zarr.open(p)["obs"]),
)
for p in Path("path/to/output/collection").glob("*.zarr")
]
)
# `use_collection` automatically uses the on-disk `X` and full `obs` in the `Loader`
# but the `load_adata` arg can override this behavior
# (see `custom_load_func` above for an example of customization).
ds = ds.use_collection(collection)

# Iterate over the dataloader (drop-in replacement for torch.utils.DataLoader)
for batch in ds:
Expand Down
3 changes: 1 addition & 2 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@
:toctree: generated/

write_sharded
add_to_collection
create_anndata_collection
DatasetCollection
```

(types)=
Expand Down
15 changes: 3 additions & 12 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@ Let's go through the above example:
### Preprocessing

```python
create_anndata_collection(
collection = DatasetCollection("path/to/output/store.zarr").add_adatas(
adata_paths=[
"path/to/your/file1.h5ad",
"path/to/your/file2.h5ad"
],
output_path="path/to/output/store", # a directory containing `chunk_{i}.zarr`
shuffle=True, # shuffling is needed if you want to use chunked access
)
```
Expand All @@ -33,20 +32,12 @@ See the [zarr docs on sharding][] for more information.
#### Chunked access

```python
# `use_collection` will automatically get everything in `X` and `obs` and yield it.
ds = Loader(
batch_size=4096,
chunk_size=32,
preload_nchunks=256,
).add_anndatas(
[
ad.AnnData(
# note that you can open an anndata file using any type of zarr store
X=ad.io.sparse_dataset(zarr.open(p)["X"]),
obs=ad.io.read_elem(zarr.open(p)["obs"]),
)
for p in PATH_TO_STORE.glob("*.zarr")
]
)
).use_collection(collection)

# Iterate over the dataloader (drop-in replacement for torch.utils.DataLoader)
for batch in ds:
Expand Down
173 changes: 79 additions & 94 deletions docs/notebooks/example.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ optional-dependencies.doc = [
"myst-nb>=1.1",
"pandas",
"scanpydoc[theme,typehints]>=0.15.3",
"sphinx>=8.1",
# https://github.com/sphinx-toolbox/sphinx-toolbox/issues/201
"sphinx>=8.1,<=8.2.3",
"sphinx-autodoc-typehints",
"sphinx-book-theme>=1",
"sphinx-copybutton",
Expand Down
4 changes: 2 additions & 2 deletions src/annbatch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from importlib.metadata import version

from . import types
from .io import add_to_collection, create_anndata_collection, write_sharded
from .io import DatasetCollection, write_sharded
from .loader import Loader

__version__ = version("annbatch")

__all__ = ["Loader", "write_sharded", "add_to_collection", "create_anndata_collection", "types"]
__all__ = ["Loader", "write_sharded", "DatasetCollection", "types"]
Loading
Loading