Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 21 additions & 56 deletions benchmarks/01_size_scaling.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,26 @@
"cell_type": "markdown",
"id": "9b315ef7",
"metadata": {},
"source": [
"# Size scaling — point cloud\n",
"\n",
"Write/read/disk-size of point clouds at increasing `N`. Same\n",
"`chunk_shape` across runs so the only variable is vertex count.\n",
"\n",
"Runtime: a few minutes on a laptop (the 1M case dominates)."
]
"source": "# Size scaling — point cloud\n\nWrite/read/disk-size of point clouds at increasing `N`, with **CSV as a\nbaseline** for context. Same `chunk_shape` across runs so the only\nvariable is vertex count.\n\nFor each `N` we measure:\n\n| Operation | zarr-vectors | CSV (baseline) |\n| --- | --- | --- |\n| Write | `write_points` | `pandas.to_csv` |\n| Read all | `read_points` | `pandas.read_csv` |\n| Read one | one chunk via lazy API | `read_csv(nrows=1)` (best case) |\n| Disk size | store directory | CSV file |\n\nRuntime: a few minutes on a laptop (the 1M case dominates)."
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "b97bfc29",
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'pandas'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m os, time, tempfile, shutil\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m pathlib \u001b[38;5;28;01mimport\u001b[39;00m Path\n\u001b[32m 3\u001b[39m \n\u001b[32m 4\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m numpy \u001b[38;5;28;01mas\u001b[39;00m np\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m pandas \u001b[38;5;28;01mas\u001b[39;00m pd\n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m matplotlib.pyplot \u001b[38;5;28;01mas\u001b[39;00m plt\n\u001b[32m 7\u001b[39m \n\u001b[32m 8\u001b[39m \n",
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'pandas'"
]
}
],
"source": [
"import os, time, tempfile, shutil\n",
"from pathlib import Path\n",
Expand Down Expand Up @@ -60,14 +65,7 @@
"id": "5bcdba96",
"metadata": {},
"outputs": [],
"source": [
"from zarr_vectors.types.points import write_points, read_points\n",
"\n",
"SIZES = [1_000, 10_000, 100_000, 1_000_000]\n",
"CHUNK = (200.0, 200.0, 200.0)\n",
"BIN = (50.0, 50.0, 50.0)\n",
"SEED = 0"
]
"source": "from zarr_vectors.types.points import write_points, read_points\nfrom zarr_vectors.lazy import open_zv\n\nSIZES = [1_000, 10_000, 100_000, 1_000_000]\nCHUNK = (200.0, 200.0, 200.0)\nBIN = (50.0, 50.0, 50.0)\nSEED = 0\n\n\ndef _csv_path(prefix):\n \"\"\"Fresh tempdir + CSV path.\"\"\"\n return Path(tempfile.mkdtemp(prefix=f'csvbench_{prefix}_')) / 'points.csv'\n\n\ndef _csv_write(path, positions, intensity):\n \"\"\"Baseline: write x,y,z,intensity columns to a CSV.\"\"\"\n pd.DataFrame({\n 'x': positions[:, 0],\n 'y': positions[:, 1],\n 'z': positions[:, 2],\n 'intensity': intensity,\n }).to_csv(path, index=False)\n\n\ndef _csv_read_all(path):\n \"\"\"Read every row back into memory.\"\"\"\n return pd.read_csv(path)\n\n\ndef _csv_read_one(path):\n \"\"\"Best-case single-row read: only parse the first data row.\n\n CSV has no random access, so this is the cheapest single-record\n read the format admits.\"\"\"\n return pd.read_csv(path, nrows=1)\n\n\ndef _zv_read_one(store_path):\n \"\"\"Read just one chunk's worth of vertices via the lazy API.\n\n Touches a single chunk on disk (vs. the full materialisation in\n ``read_points``).\"\"\"\n zv = open_zv(store_path)\n chunk_keys = zv[0].vertices._chunk_keys # noqa: SLF001 — minimal demo\n if not chunk_keys:\n return None\n return zv[0].vertices[chunk_keys[0]].compute()"
},
{
"cell_type": "markdown",
Expand All @@ -83,30 +81,7 @@
"id": "d0b220e0",
"metadata": {},
"outputs": [],
"source": [
"rng = np.random.default_rng(SEED)\n",
"rows = []\n",
"for n in SIZES:\n",
" positions = rng.uniform(0, 1000, (n, 3)).astype(np.float32)\n",
" intensity = rng.uniform(0, 1, n).astype(np.float32)\n",
"\n",
" store = _new_store(f'size_{n}')\n",
" t_write, _ = _time(\n",
" write_points, store, positions,\n",
" chunk_shape=CHUNK, bin_shape=BIN,\n",
" attributes={'intensity': intensity},\n",
" )\n",
" t_read, _ = _time(read_points, store, attribute_names=['intensity'])\n",
" rows.append({\n",
" 'N': n,\n",
" 'write_s': round(t_write, 3),\n",
" 'read_s': round(t_read, 3),\n",
" 'size_MB': round(_store_bytes(store) / 1e6, 2),\n",
" })\n",
" shutil.rmtree(Path(store).parent, ignore_errors=True)\n",
"\n",
"df = pd.DataFrame(rows)"
]
"source": "rng = np.random.default_rng(SEED)\nrows = []\nfor n in SIZES:\n positions = rng.uniform(0, 1000, (n, 3)).astype(np.float32)\n intensity = rng.uniform(0, 1, n).astype(np.float32)\n\n # ---- ZV ----\n store = _new_store(f'size_{n}')\n t_zv_write, _ = _time(\n write_points, store, positions,\n chunk_shape=CHUNK, bin_shape=BIN,\n attributes={'intensity': intensity},\n )\n t_zv_read_all, _ = _time(read_points, store, attribute_names=['intensity'])\n t_zv_read_one, _ = _time(_zv_read_one, store)\n size_zv_MB = _store_bytes(store) / 1e6\n\n # ---- CSV baseline ----\n csv = _csv_path(f'size_{n}')\n t_csv_write, _ = _time(_csv_write, csv, positions, intensity)\n t_csv_read_all, _ = _time(_csv_read_all, csv)\n t_csv_read_one, _ = _time(_csv_read_one, csv)\n size_csv_MB = csv.stat().st_size / 1e6\n\n rows.append({\n 'N': n,\n 'zv_write_s': round(t_zv_write, 4),\n 'csv_write_s': round(t_csv_write, 4),\n 'zv_read_all_s': round(t_zv_read_all, 4),\n 'csv_read_all_s':round(t_csv_read_all,4),\n 'zv_read_one_s': round(t_zv_read_one, 4),\n 'csv_read_one_s':round(t_csv_read_one,4),\n 'zv_size_MB': round(size_zv_MB, 2),\n 'csv_size_MB': round(size_csv_MB, 2),\n })\n\n shutil.rmtree(Path(store).parent, ignore_errors=True)\n shutil.rmtree(csv.parent, ignore_errors=True)\n\ndf = pd.DataFrame(rows)"
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -140,22 +115,12 @@
"id": "6ca88043",
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(6, 4))\n",
"ax.loglog(df['N'], df['write_s'], 'o-', label='write (s)')\n",
"ax.loglog(df['N'], df['read_s'], 's-', label='read (s)')\n",
"ax.loglog(df['N'], df['size_MB'], '^-', label='size (MB)')\n",
"ax.set_xlabel('N (vertices)')\n",
"ax.set_title('Point cloud: write/read time + disk footprint vs N')\n",
"ax.legend()\n",
"ax.grid(True, which='both', alpha=0.3)\n",
"plt.tight_layout()"
]
"source": "fig, axes = plt.subplots(1, 4, figsize=(20, 4.5), sharex=True)\n\npanels = [\n ('Write time', 'write_s', 'zv_write_s', 'csv_write_s', 's'),\n ('Read all', 'read_all_s', 'zv_read_all_s', 'csv_read_all_s', 's'),\n ('Read one', 'read_one_s', 'zv_read_one_s', 'csv_read_one_s', 's'),\n ('Disk size', 'size_MB', 'zv_size_MB', 'csv_size_MB', 'MB'),\n]\nfor ax, (title, _key, zv_col, csv_col, unit) in zip(axes, panels):\n ax.loglog(df['N'], df[zv_col], 'o-', label='zarr-vectors', color='tab:blue')\n ax.loglog(df['N'], df[csv_col], 's-', label='csv', color='tab:orange')\n ax.set_title(title)\n ax.set_xlabel('N (vertices)')\n ax.set_ylabel(unit)\n ax.grid(True, which='both', alpha=0.3)\n ax.legend()\n\nfig.suptitle('zarr-vectors vs CSV — point cloud scaling', y=1.02)\nplt.tight_layout()"
}
],
"metadata": {
"kernelspec": {
"display_name": "zarr-vectors",
"display_name": ".venv (3.13.13)",
"language": "python",
"name": "python3"
},
Expand All @@ -169,9 +134,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.15"
"version": "3.13.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}
2 changes: 1 addition & 1 deletion docs/spec/chunking/attribute_chunking.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ a bin index via the stored `chunk_attribute_values` list, and the chunk
scan is restricted to keys with that leading coord. Unknown values
yield an empty result rather than an error.

Lazy readers (`ZVRLevel`) expose:
Lazy readers (`ZVLevel`) expose:

- `chunk_dims` — the level's chunk-axis names, or `None` for legacy.
- `chunk_attribute_name` — the leading-axis attribute name.
Expand Down
13 changes: 7 additions & 6 deletions docs/spec/foundations/store_types.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
## Introduction

ZV stores are backend-agnostic: the same `create_store` / `open_store` /
`open_zvr` calls work whether the data lives on a local SSD, a ZIP
`open_zv` calls work whether the data lives on a local SSD, a ZIP
archive, an in-memory dict, or a cloud object store. The backing store
type affects performance characteristics (latency, throughput, cost per
request) but not the data model or the semantics of any operation.
Expand Down Expand Up @@ -69,11 +69,12 @@ All three entry points accept `backend=` and `**backend_kwargs`:

```python
from zarr_vectors.core.store import create_store, open_store
from zarr_vectors.lazy import open_zvr
from zarr_vectors.lazy import open_zv

create_store(path, root_metadata, *, backend=None, **backend_kwargs) -> Group
create_store(path, *, bounds=None, chunk_shape=None, axes=None,
geometry_types=None, ..., backend=None, **backend_kwargs) -> Group
open_store(path, mode="r", *, backend=None, **backend_kwargs) -> Group
open_zvr(path, *, backend=None, **backend_kwargs) -> ZVRStore
open_zv(path, *, backend=None, **backend_kwargs) -> ZVStore
```

`backend` is one of `"local"` / `"obstore"` / `"fsspec"` or `None` for
Expand Down Expand Up @@ -301,5 +302,5 @@ a resolution level).
The backend layer is independent of the
[format capability tokens](../layout/root_metadata.md) stamped on
`RootMetadata.format_capabilities` — backends carry data bytes, not
format semantics. See the capability list for `CAP_CROSS_CHUNK_FACES`,
`CAP_VERTEX_COUNT_CACHE`, `CAP_MULTISCALE_LINKS`, etc.
format semantics. See the capability list for `CAP_MULTISCALE_LINKS`,
`CAP_PRESERVED_OBJECT_IDS`, `CAP_SHARED_VERTEX_GROUPS`.
3 changes: 2 additions & 1 deletion docs/spec/multiscale/pyramid_construction.md
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,8 @@ which:

1. Walks every adjacent `(fine, coarse)` level pair.
2. Reconstructs the fine→parent map from the coarse level's
`metanode_children` sidecar.
`cross_chunk_links/<delta=-1>/` records (each record pairs a
coarse metanode with one of its fine children).
3. Builds the trivial edge list `[(i, parent[i]) for i in range(n_fine)]`.
4. Partitions via
[`partition_cross_level_edges`](../../../zarr_vectors/spatial/boundary.py)
Expand Down
40 changes: 22 additions & 18 deletions docs/spec/object_model/cross_chunk_links.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,28 +126,30 @@ chunk at the **owning level**, and column 1 is a local vertex index in
the **same chunk key** at level `owning_level + N`. The reader doesn't
need any cross-chunk-coords information — both sides share `<chunk_key>`.

**Paired vertex-group offsets:** for `delta == 0` only, the byte
offset of each link group is paired into the matching
`vertex_group_offsets/<chunk_key>` table so a reader can fetch one
vertex group's edges without rescanning the chunk. For `delta != 0`
the source vertex groups and link groups belong to different levels,
so the pairing is meaningless and the writer skips it. See the
guardrail in
[`write_chunk_links`](../../../zarr_vectors/core/arrays.py).
**Self-describing blob.** Each `links/<delta>/<chunk_key>` file is a
self-describing ragged blob: an int64 header with `K` followed by the
`K` per-group byte offsets, then the concatenated link bytes. Readers
recover the per-vertex-group partition without consulting any sibling
table.

### `cross_chunk_links/<delta>/data` — global flat blob

Each link is `2 * (sid_ndim + 1)` int64s laid out as
Each record is `link_width * (sid_ndim + 1)` int64s laid out as
`link_width` back-to-back `(chunk_coords, vertex_idx)` endpoints:

```
[chunk_a_0, ..., chunk_a_{ndim-1}, vi_a,
chunk_b_0, ..., chunk_b_{ndim-1}, vi_b]
[chunk_0_0, ..., chunk_0_{ndim-1}, vi_0,
chunk_1_0, ..., chunk_1_{ndim-1}, vi_1,
...
chunk_{L-1}_0, ..., chunk_{L-1}_{ndim-1}, vi_{L-1}]
```

— i.e. the two endpoints written back-to-back. `chunk_a` is a chunk
coordinate at the **owning level**; `chunk_b` is a chunk coordinate at
the **target level** (`owning_level + level_delta`). `vi_a` and `vi_b`
are local vertex indices within their respective chunks.
`link_width=2` (the default) encodes a classic cross-chunk edge;
`link_width=3` encodes a triangle face spanning chunks (used by mesh
writers); `link_width=1` encodes a single parent→child reference for
pyramid metanode drill-down. Endpoint 0 lives at the **owning level**;
endpoints 1..L-1 (where `L = link_width`) live at the **target level**
(`owning_level + level_delta`).

**`.zattrs` schema** (see
[`zarr_vectors/core/arrays.py:write_cross_chunk_links`](../../../zarr_vectors/core/arrays.py)):
Expand All @@ -157,7 +159,8 @@ are local vertex indices within their respective chunks.
"zv_array": "cross_chunk_links",
"num_links": 12,
"sid_ndim": 3,
"level_delta": 1
"level_delta": 1,
"link_width": 2
}
```

Expand Down Expand Up @@ -233,8 +236,9 @@ chunk → bucket into per-chunk `(M_local, link_width)` rows for
[`_write_cross_level_edges`](../../../zarr_vectors/multiresolution/coarsen.py)
during pyramid construction. For each adjacent (fine, coarse) pair,
every fine vertex has exactly one trivial edge to its coarse parent
metanode (the parent map is reconstructed from `metanode_children`).
The edges are then partitioned via
metanode (the parent map is recovered from the coarse level's own
`cross_chunk_links/<delta=-1>/` records). The edges are then
partitioned via
[`partition_cross_level_edges`](../../../zarr_vectors/spatial/boundary.py):
chunk-aligned edges (source chunk_key == target chunk_key when
re-evaluated against the coarser grid) become rows in
Expand Down
8 changes: 4 additions & 4 deletions docs/tutorials/io/cloud_stores.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ to `fsspec` for any URL scheme it can't handle.
## Backend resolution at a glance

When you pass a cloud URL to any `read_*` / `write_*` / `open_store` /
`open_zvr` call, the backend is chosen in this order:
`open_zv` call, the backend is chosen in this order:

1. **Explicit `backend=` kwarg** — e.g. `backend="fsspec"` forces fsspec
even if obstore is installed.
Expand Down Expand Up @@ -262,9 +262,9 @@ a resolution level, writing new attributes).

```python
import numpy as np
from zarr_vectors.lazy import open_zvr
from zarr_vectors.lazy import open_zv

store = open_zvr("s3://open-neuro/scan.zarrvectors")
store = open_zv("s3://open-neuro/scan.zarrvectors")

print(store.levels) # metadata only — no chunk I/O
print(store[2].vertex_count) # one metadata request
Expand All @@ -281,7 +281,7 @@ detail = read_points(
)
```

`open_zvr` accepts the same `backend=` / `**backend_kwargs` as
`open_zv` accepts the same `backend=` / `**backend_kwargs` as
`open_store`.

---
Expand Down
30 changes: 15 additions & 15 deletions docs/tutorials/multiscale/lazy_loading.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ The ZVF read functions (`read_points`, `read_polylines`, etc.) are eager:
they fetch and return all requested data immediately. For large stores or
remote datasets, an eager read of the full store is impractical.

The **lazy API** provides a `open_zvr` object that opens the store
The **lazy API** provides an `open_zv` function that opens the store
metadata without reading any array data. Array slices are fetched on demand
— only when accessed. This is the recommended access pattern for:

Expand All @@ -20,10 +20,10 @@ metadata without reading any array data. Array slices are fetched on demand
## Opening a store lazily

```python
from zarr_vectors.lazy import open_zvr
from zarr_vectors.lazy import open_zv

# Opens metadata only — no vertex data fetched
store = open_zvr("synchrotron.zarrvectors")
store = open_zv("synchrotron.zarrvectors")

print(store.geometry_type) # "point_cloud"
print(store.spatial_dims) # 3
Expand All @@ -38,7 +38,7 @@ Opening a remote store is identical — pass an fsspec URL:

```python
import s3fs
from zarr_vectors.lazy import open_zvr
from zarr_vectors.lazy import open_zv

# 0.4+: backend layer auto-routes cloud URLs via obstore (or fsspec).
# Public access works without explicit anon=True.
Expand Down Expand Up @@ -140,7 +140,7 @@ print(f"Mean intensity over {total_count} points: {mean_intensity:.4f}")

## Lazy array access

The `open_zvr` exposes each array as a lazy `zarr.Array` that can
The store returned by `open_zv` exposes each array as a lazy `zarr.Array`
that can be sliced directly:

```python
Expand Down Expand Up @@ -178,18 +178,18 @@ print(f"{len(high_fa_ids)} high-FA streamlines")

```python
import s3fs
from zarr_vectors.lazy import open_zvr
from zarr_vectors.lazy import open_zv

store = open_zvr("s3://my-bucket/dataset/tracts.zarrvectors")
store = open_zv("s3://my-bucket/dataset/tracts.zarrvectors")
```

### GCS

```python
import gcsfs
from zarr_vectors.lazy import open_zvr
from zarr_vectors.lazy import open_zv

store = open_zvr("gs://my-bucket/tracts.zarrvectors")
store = open_zv("gs://my-bucket/tracts.zarrvectors")
```

### Performance on object stores
Expand All @@ -203,7 +203,7 @@ minimises requests by:
3. Caching decompressed chunks in an LRU cache (configurable size).

```python
store = open_zvr(
store = open_zv(
"s3://my-bucket/tracts.zarrvectors",
cache_size=256, # cache up to 256 decompressed chunks in memory
n_workers=8, # fetch up to 8 chunks in parallel
Expand All @@ -223,12 +223,12 @@ for chunk_coord, chunk_data in store.iter_chunks(level=1, prefetch=4):

---

## open_zvr API summary
## open_zv API summary

```python
from zarr_vectors.lazy import open_zvr
from zarr_vectors.lazy import open_zv

store = open_zvr(path_or_store)
store = open_zv(path_or_store)

# Metadata (no data I/O)
store.geometry_type # str
Expand Down Expand Up @@ -258,7 +258,7 @@ store.__enter__() / store.__exit__() # context manager
### Using as a context manager

```python
with open_zvr("scan.zarrvectors") as store:
with open_zv("scan.zarrvectors") as store:
result = store.read(level=2)
# Store is closed and cache is freed on exit
```
Expand All @@ -272,7 +272,7 @@ with open_zvr("scan.zarrvectors") as store:
Load the coarsest level for a quick full-volume thumbnail:

```python
store = open_zvr("scan.zarrvectors")
store = open_zv("scan.zarrvectors")
coarsest = store.levels[-1]
result = store.read(level=coarsest)
# Use result["positions"] to render a low-density overview
Expand Down
Loading
Loading