From e8da9ff02c6bdd02a7e173934e007c5bf6620913 Mon Sep 17 00:00:00 2001 From: Forrest Collman Date: Thu, 14 May 2026 22:22:33 -0700 Subject: [PATCH] initial shared fragments implementation --- schema/zarr_vectors.linkml.yaml | 34 +- tests/test_core.py | 6 +- tests/test_encoding.py | 82 --- tests/test_fragments.py | 279 ++++++++++ tests/test_linkml_schema.py | 22 +- tests/test_multiscale_links.py | 4 +- tests/test_object_index_blocks.py | 179 +++++++ tests/test_per_object_pyramid.py | 6 +- tests/test_scaffolding.py | 2 +- zarr_vectors/constants.py | 42 +- zarr_vectors/core/arrays.py | 311 +++++++++-- zarr_vectors/core/group.py | 2 +- zarr_vectors/core/metadata.py | 22 +- zarr_vectors/encoding/compression.py | 9 +- zarr_vectors/encoding/fragments.py | 682 ++++++++++++++++++++++++ zarr_vectors/encoding/ragged.py | 125 +---- zarr_vectors/lazy/level.py | 6 +- zarr_vectors/multiresolution/coarsen.py | 12 +- zarr_vectors/rechunk/rebin.py | 2 +- zarr_vectors/types/graphs.py | 4 +- zarr_vectors/types/lines.py | 4 +- zarr_vectors/types/meshes.py | 20 +- zarr_vectors/types/points.py | 6 +- zarr_vectors/types/polylines.py | 4 +- 24 files changed, 1540 insertions(+), 325 deletions(-) create mode 100644 tests/test_fragments.py create mode 100644 tests/test_object_index_blocks.py create mode 100644 zarr_vectors/encoding/fragments.py diff --git a/schema/zarr_vectors.linkml.yaml b/schema/zarr_vectors.linkml.yaml index a1e5e8c..cc2d645 100644 --- a/schema/zarr_vectors.linkml.yaml +++ b/schema/zarr_vectors.linkml.yaml @@ -109,11 +109,18 @@ enums: (``LevelMetadata.preserves_object_ids=True``). Dropped objects appear as empty manifest slots; ``parent_level`` carries semantic weight on those levels. - shared_vertex_groups: + shared_fragments: description: > - At least one level stores per-chunk vertex groups that may be - referenced by multiple objects' manifests (shared metavertices - in the per-object pyramid regime). 
+ At least one level stores per-chunk fragments that may be + referenced by multiple objects' manifests (the v0.6 successor + to ``shared_vertex_groups``; the sharing primitive is now a + fragment rather than a contiguous-byte vertex group). + fragment_index: + description: > + The store uses the v0.6 fragment-index encoding for + ``vertex_fragments`` and ``link_fragments`` (a single uint8 + blob per chunk with a tagged-bitmap header + range table + + explicit CSR; see :mod:`zarr_vectors.encoding.fragments`). multiscale_links: description: > Store uses the 0.4 multiscale links layout @@ -129,6 +136,16 @@ enums: stamps the corresponding tag. permissible_values: vertices: + vertex_fragments: + description: > + Per-chunk fragment-index group for ``vertices/`` (v0.6+). + Replaces the v0.5 ``vertex_group_offsets``. + link_fragments: + description: > + Per-chunk fragment-index group for ``links/0/`` (v0.6+, + delta == 0 only). Splits the v0.5 inline self-describing + header into a sibling group so link bytes can be addressed + uniformly with vertex bytes. links: attribute: object_index: @@ -220,7 +237,7 @@ classes: - chunk_attribute_values - preserves_object_ids - inherited_num_objects - - shared_vertex_groups + - shared_fragments # ----- per-array .zattrs shapes (zv_array discriminator) -------------- @@ -501,11 +518,12 @@ slots: ``preserves_object_ids`` is true; absent on standalone levels. range: integer minimum_value: 0 - shared_vertex_groups: + shared_fragments: description: > - True when per-chunk vertex groups may be referenced by multiple + True when per-chunk fragments may be referenced by multiple objects' manifests (shared metavertices in the per-object pyramid - regime). Readers MAY use this to short-circuit dedup work. + regime). v0.6 successor to ``shared_vertex_groups``. Readers + MAY use this to short-circuit dedup work. 
range: boolean # -- per-array .zattrs ------------------------------------------------ diff --git a/tests/test_core.py b/tests/test_core.py index 34bce7b..dcb1e2d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -550,7 +550,7 @@ def test_create_store(self, tmp_store_path: Path) -> None: # The warm shell already contains the empty ragged-vertex pair. res0 = root["0"] assert "vertices" in res0 - assert "vertex_group_offsets" in res0 + assert "vertex_fragments" in res0 def test_create_store_minimal(self, tmp_store_path: Path) -> None: """create_store(path) with no kwargs produces a warm 3D shell @@ -571,7 +571,7 @@ def test_create_store_minimal(self, tmp_store_path: Path) -> None: assert "parametric" not in root res0 = root["0"] assert "vertices" in res0 - assert "vertex_group_offsets" in res0 + assert "vertex_fragments" in res0 def test_create_store_ndim_2d(self, tmp_store_path: Path) -> None: """ndim kwarg resolves to a 2D store with default 2D bounds.""" @@ -716,7 +716,7 @@ def test_basic_info(self, tmp_store_path: Path) -> None: write_parametric_types(root, [PARAMETRIC_PLANE]) info = store_info(root) - assert info["zv_version"].startswith("0.5") + assert info["zv_version"].startswith("0.6") assert info["geometry_types"] == ["point_cloud", "skeleton"] assert info["chunk_shape"] == [100.0, 100.0, 100.0] assert len(info["levels"]) == 1 diff --git a/tests/test_encoding.py b/tests/test_encoding.py index faeabcc..24c751b 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -5,16 +5,12 @@ import numpy as np from zarr_vectors.encoding.ragged import ( - decode_object_index, decode_ragged_blob, decode_ragged_ints, decode_vertex_groups, - decode_vertex_offsets, - encode_object_index, encode_ragged_blob, encode_ragged_ints, encode_vertex_groups, - encode_vertex_offsets, ) from zarr_vectors.encoding.compression import ( get_codec_pipeline, @@ -149,84 +145,6 @@ def test_empty(self) -> None: # Object index round-trips # 
--------------------------------------------------------------------------- -class TestObjectIndexEncoding: - - def test_basic_3d(self) -> None: - manifests = [ - [((0, 0, 0), 0), ((0, 0, 1), 2)], # object 0: 2 vertex groups - [((1, 1, 1), 0)], # object 1: 1 vertex group - [((0, 0, 0), 1), ((0, 1, 0), 0), ((1, 0, 0), 3)], # object 2: 3 vg - ] - raw, offsets = encode_object_index(manifests, sid_ndim=3) - decoded = decode_object_index(raw, offsets, sid_ndim=3) - assert len(decoded) == 3 - assert decoded[0] == manifests[0] - assert decoded[1] == manifests[1] - assert decoded[2] == manifests[2] - - def test_2d(self) -> None: - manifests = [ - [((0, 1), 0), ((1, 1), 3)], - ] - raw, offsets = encode_object_index(manifests, sid_ndim=2) - decoded = decode_object_index(raw, offsets, sid_ndim=2) - assert decoded[0] == manifests[0] - - def test_4d_xyzt(self) -> None: - manifests = [ - [((0, 0, 0, 0), 0), ((0, 0, 0, 1), 0)], - ] - raw, offsets = encode_object_index(manifests, sid_ndim=4) - decoded = decode_object_index(raw, offsets, sid_ndim=4) - assert decoded[0] == manifests[0] - - def test_empty_manifest(self) -> None: - manifests = [ - [((0, 0, 0), 0)], - [], # empty object - [((1, 1, 1), 0)], - ] - raw, offsets = encode_object_index(manifests, sid_ndim=3) - decoded = decode_object_index(raw, offsets, sid_ndim=3) - assert decoded[0] == manifests[0] - assert decoded[1] == [] - assert decoded[2] == manifests[2] - - def test_no_objects(self) -> None: - raw, offsets = encode_object_index([], sid_ndim=3) - decoded = decode_object_index(raw, offsets, sid_ndim=3) - assert decoded == [] - - def test_wrong_sid_ndim_raises(self) -> None: - manifests = [ - [((0, 0), 0)], # 2D coords - ] - try: - encode_object_index(manifests, sid_ndim=3) - assert False, "Should have raised ArrayError" - except ArrayError: - pass - - -# --------------------------------------------------------------------------- -# Vertex offsets round-trips (K×1 plain int64) -# 
--------------------------------------------------------------------------- - -class TestVertexOffsets: - - def test_basic(self) -> None: - v_off = np.array([0, 36, 108], dtype=np.int64) - raw = encode_vertex_offsets(v_off) - dec_v = decode_vertex_offsets(raw) - np.testing.assert_array_equal(dec_v, v_off) - - def test_empty(self) -> None: - raw = encode_vertex_offsets(np.array([], dtype=np.int64)) - assert raw == b"" - dec = decode_vertex_offsets(raw) - assert len(dec) == 0 - - # --------------------------------------------------------------------------- # Self-describing ragged blob (inline offset header) round-trips # --------------------------------------------------------------------------- diff --git a/tests/test_fragments.py b/tests/test_fragments.py new file mode 100644 index 0000000..06ad522 --- /dev/null +++ b/tests/test_fragments.py @@ -0,0 +1,279 @@ +"""Tests for the per-chunk fragment-index encoder/decoder.""" + +from __future__ import annotations + +import numpy as np +import pytest + +from zarr_vectors.encoding.fragments import ( + FRAGMENT_INDEX_MAGIC, + FRAGMENT_INDEX_VERSION, + FragmentIndex, + decode_fragments, + encode_fragments, +) +from zarr_vectors.exceptions import ArrayError + + +# --------------------------------------------------------------------------- +# Round-trip basics +# --------------------------------------------------------------------------- + + +def _roundtrip(fragments, **kwargs) -> FragmentIndex: + raw = encode_fragments(fragments, **kwargs) + return decode_fragments(raw) + + +def test_empty_chunk_header_only_sentinel() -> None: + raw = encode_fragments([]) + assert len(raw) == 16, "F=0 must be a header-only 16-byte blob" + fi = decode_fragments(raw) + assert len(fi) == 0 + assert fi.num_range_fragments == 0 + assert fi.num_explicit_fragments == 0 + + +def test_single_range() -> None: + fi = _roundtrip([(100, 5)]) + assert len(fi) == 1 + assert fi.is_range(0) is True + assert fi.range(0) == (100, 5) + assert 
fi.indices(0).tolist() == [100, 101, 102, 103, 104] + + +def test_single_explicit() -> None: + fi = _roundtrip([np.array([3, 7, 9, 100])]) + assert len(fi) == 1 + assert fi.is_range(0) is False + assert fi.indices(0).tolist() == [3, 7, 9, 100] + view = fi.indices_view(0) + assert view.tolist() == [3, 7, 9, 100] + assert view.flags["WRITEABLE"] is False or True # just exercise + + +def test_mixed_bitmap() -> None: + fragments = [ + (10, 5), # 0 range + np.array([100, 101, 200]), # 1 explicit (not arange because 200) + (50, 1), # 2 range + np.array([7]), # 3 explicit (single elem, but auto-detected as range arange(7,8)) + (0, 0), # 4 empty range + np.array([], dtype=np.int64), # 5 empty explicit + ] + fi = _roundtrip(fragments) + assert len(fi) == 6 + # f=3 should be auto-promoted to range (arange(7, 8) == [7]) + assert fi.is_range(0) is True and fi.range(0) == (10, 5) + assert fi.is_range(1) is False + assert fi.is_range(2) is True and fi.range(2) == (50, 1) + assert fi.is_range(3) is True and fi.range(3) == (7, 1) + assert fi.is_range(4) is True and fi.range(4) == (0, 0) + assert fi.is_range(5) is False + # Round-trip the explicit ones + assert fi.indices(1).tolist() == [100, 101, 200] + assert fi.indices(4).tolist() == [] + assert fi.indices(5).tolist() == [] + + +def test_auto_detect_long_arange() -> None: + f0 = np.arange(1000, 2000, dtype=np.int64) + fi = _roundtrip([f0]) + assert fi.is_range(0) is True + assert fi.range(0) == (1000, 1000) + # Range path footprint stays tiny regardless of N. 
+ raw = encode_fragments([f0]) + # 16 header + 8 bitmap (padded) + 16 range row + 4 csr_offsets[0] = 44 + assert len(raw) == 16 + 8 + 16 + 4 + + +def test_force_explicit_overrides_auto_range() -> None: + raw = encode_fragments( + [np.arange(5, 15, dtype=np.int64)], force_explicit=True, + ) + fi = decode_fragments(raw) + assert fi.is_range(0) is False + assert fi.indices(0).tolist() == list(range(5, 15)) + + +def test_all_range_fast_path_byte_budget() -> None: + """The motivating use case: every fragment is a range.""" + f = 1000 + fragments = [(i * 10, 10) for i in range(f)] + raw = encode_fragments(fragments) + # 16 header + ceil(1000/8)=125, padded to 128, + 1000*16 range table + # + 4 csr_offsets (E=0 → 1 slot). Total = 16 + 128 + 16000 + 4 = 16148. + assert len(raw) == 16 + 128 + 1000 * 16 + 4 + fi = decode_fragments(raw) + assert len(fi) == f + assert fi.num_range_fragments == f + # Spot check a couple of ranges + assert fi.range(0) == (0, 10) + assert fi.range(999) == (9990, 10) + + +def test_alignment_padding_F_eq_3() -> None: + """3 fragments → 1-byte raw bitmap, padded to 8 → range table aligned.""" + fragments = [(0, 1), (1, 1), (2, 1)] + raw = encode_fragments(fragments) + # 16 + 8 (padded bitmap) + 3*16 + 4 (csr_offsets[0]) + assert len(raw) == 16 + 8 + 48 + 4 + fi = decode_fragments(raw) + assert [fi.range(i) for i in range(3)] == [(0, 1), (1, 1), (2, 1)] + + +# --------------------------------------------------------------------------- +# is_range(f) must not materialise fragments +# --------------------------------------------------------------------------- + + +class _NoMaterializeFragmentIndex(FragmentIndex): + """Subclass that explodes if .indices / .range / .indices_view are called.""" + + def __init__(self, fi: FragmentIndex) -> None: + super().__init__( + num_fragments=fi.num_fragments, + _bitmap=fi._bitmap, + _range_table=fi._range_table, + _csr_offsets=fi._csr_offsets, + _csr_indices=fi._csr_indices, + ) + + def range(self, f): # type: 
ignore[override] + raise AssertionError( + f"is_range({f}) decoded fragment payload (called .range)", + ) + + def indices(self, f): # type: ignore[override] + raise AssertionError( + f"is_range({f}) decoded fragment payload (called .indices)", + ) + + def indices_view(self, f): # type: ignore[override] + raise AssertionError( + f"is_range({f}) decoded fragment payload (called .indices_view)", + ) + + +def test_is_range_does_not_decode_payload() -> None: + """Stress the must-have invariant: is_range(f) is pure bit lookup.""" + fragments = [(i, 5) if i % 3 == 0 else np.array([i, i + 100, i + 200]) + for i in range(100)] + base = _roundtrip(fragments) + no_materialize = _NoMaterializeFragmentIndex(base) + # Calling is_range over the whole F must never reach .range/.indices. + for f in range(len(no_materialize)): + no_materialize.is_range(f) + + +# --------------------------------------------------------------------------- +# Random access through prefix-popcount +# --------------------------------------------------------------------------- + + +def test_random_access_uses_lazy_popcount() -> None: + f = 50 + rng = np.random.default_rng(42) + fragments = [] + for i in range(f): + if rng.random() < 0.5: + fragments.append((rng.integers(0, 1000).item(), rng.integers(1, 6).item())) + else: + fragments.append( + rng.integers(0, 1000, size=rng.integers(1, 6).item()).astype(np.int64), + ) + fi = _roundtrip(fragments) + # Visit fragments in non-sequential order; each call should give the + # same answer the encoder put in. + order = list(range(f)) + rng.shuffle(order) + for idx in order: + if fi.is_range(idx): + start, count = fi.range(idx) + assert fi.indices(idx).tolist() == list(range(start, start + count)) + else: + decoded = fi.indices(idx).tolist() + # The encoder may have auto-promoted this fragment to range if + # the random list happened to be an arange. 
In that branch + # is_range would have been True; here we know it's False so + # the underlying CSR must round-trip exactly. + assert decoded == list(np.asarray(fragments[idx]).astype(np.int64)) + + +# --------------------------------------------------------------------------- +# Error paths +# --------------------------------------------------------------------------- + + +def test_negative_explicit_index_rejected() -> None: + with pytest.raises(ArrayError, match="non-negative"): + encode_fragments([np.array([0, -1, 2])]) + + +def test_negative_range_count_rejected() -> None: + with pytest.raises(ArrayError, match="count must be >= 0"): + encode_fragments([(5, -1)]) + + +def test_tuple_wrong_shape_rejected() -> None: + with pytest.raises(ArrayError, match="\\(start, count\\)"): + encode_fragments([(1, 2, 3)]) # type: ignore[list-item] + + +def test_2d_fragment_rejected() -> None: + with pytest.raises(ArrayError, match="1-D"): + encode_fragments([np.zeros((2, 2), dtype=np.int64)]) + + +def test_truncated_blob_rejected() -> None: + raw = encode_fragments([(0, 5), np.array([1, 2, 3])]) + with pytest.raises(ArrayError, match="truncated"): + decode_fragments(raw[:-4]) + + +def test_bad_magic_rejected() -> None: + raw = bytearray(encode_fragments([(0, 5)])) + raw[0] = 0 + with pytest.raises(ArrayError, match="magic"): + decode_fragments(bytes(raw)) + + +def test_too_short_blob_rejected() -> None: + with pytest.raises(ArrayError, match="too short"): + decode_fragments(b"\x00\x00\x00") + + +def test_range_on_explicit_raises() -> None: + fi = _roundtrip([np.array([100, 101, 200])]) + with pytest.raises(ArrayError, match="explicit"): + fi.range(0) + + +def test_indices_view_on_range_raises() -> None: + fi = _roundtrip([(10, 5)]) + with pytest.raises(ArrayError, match="explicit"): + fi.indices_view(0) + + +def test_out_of_range_fragment_index_raises() -> None: + fi = _roundtrip([(0, 1)]) + with pytest.raises(IndexError): + fi.is_range(1) + with pytest.raises(IndexError): + 
fi.is_range(-1) + + +# --------------------------------------------------------------------------- +# Header constants exposed for downstream readers +# --------------------------------------------------------------------------- + + +def test_header_constants_in_blob() -> None: + import struct + raw = encode_fragments([(0, 1)]) + magic, version, flags, f, r = struct.unpack_from(" None: + raw = encode_object_manifest_blocks([], sid_ndim=3) + assert raw == b"\x00\x00\x00\x00" + blocks = decode_object_manifest_blocks(raw, sid_ndim=3) + assert blocks == [] + + +def test_single_block_single_mode() -> None: + raw = encode_object_manifest_blocks([((1, 2, 3), 7)], sid_ndim=3) + blocks = decode_object_manifest_blocks(raw, sid_ndim=3) + assert blocks == [((1, 2, 3), 7)] + + +def test_single_block_range_mode() -> None: + raw = encode_object_manifest_blocks( + [((1, 2, 3), (4, 4))], sid_ndim=3, + ) + blocks = decode_object_manifest_blocks(raw, sid_ndim=3) + assert blocks == [((1, 2, 3), (4, 4))] + + +def test_single_block_explicit_mode() -> None: + raw = encode_object_manifest_blocks( + [((1, 2, 3), np.array([2, 7]))], sid_ndim=3, + ) + blocks = decode_object_manifest_blocks(raw, sid_ndim=3) + assert len(blocks) == 1 + coords, frag_ref = blocks[0] + assert coords == (1, 2, 3) + assert isinstance(frag_ref, np.ndarray) + assert frag_ref.tolist() == [2, 7] + + +def test_multi_chunk_manifest_walkthrough() -> None: + """Mirrors the example from plan §4: object 42 spread across 3 chunks.""" + raw = encode_object_manifest_blocks( + [ + ((0, 0, 0), np.arange(3, 6)), # auto-detected range (3, 3) + ((1, 0, 0), 0), # single + ((1, 0, 1), np.array([2, 7])), # explicit + ], + sid_ndim=3, + ) + blocks = decode_object_manifest_blocks(raw, sid_ndim=3) + assert blocks[0] == ((0, 0, 0), (3, 3)) # auto-promoted to range + assert blocks[1] == ((1, 0, 0), 0) + assert blocks[2][0] == (1, 0, 1) + assert isinstance(blocks[2][1], np.ndarray) + assert blocks[2][1].tolist() == [2, 7] + + +def 
test_force_explicit_overrides_auto_range() -> None: + raw = encode_object_manifest_blocks( + [((0, 0, 0), np.arange(3, 8))], sid_ndim=3, force_explicit=True, + ) + blocks = decode_object_manifest_blocks(raw, sid_ndim=3) + assert isinstance(blocks[0][1], np.ndarray) + assert blocks[0][1].tolist() == [3, 4, 5, 6, 7] + + +def test_sid_ndim_2d_round_trip() -> None: + raw = encode_object_manifest_blocks( + [((0, 0), (3, 3)), ((1, 1), 0)], sid_ndim=2, + ) + blocks = decode_object_manifest_blocks(raw, sid_ndim=2) + assert blocks == [((0, 0), (3, 3)), ((1, 1), 0)] + + +# --------------------------------------------------------------------------- +# Range short-circuit really saves bytes +# --------------------------------------------------------------------------- + + +def test_range_mode_smaller_than_repeated_single_mode() -> None: + """An object owning fragments 0..99 in chunk (0,0,0) should encode + much smaller as one range block than as 100 single-mode blocks. + Mirrors the example from plan §4.""" + chunk = (0, 0, 0) + range_blob = encode_object_manifest_blocks( + [(chunk, (0, 100))], sid_ndim=3, + ) + singles_blob = encode_object_manifest_blocks( + [(chunk, i) for i in range(100)], sid_ndim=3, + ) + assert len(range_blob) < len(singles_blob) + # And quantitatively: range is one block (4 + 24 coords + 1 mode + 16 payload = 45 bytes); + # singles is 100 blocks (4 + 100*(24 + 1 + 8) = 4 + 3300 = 3304 bytes). 
+ assert len(range_blob) == 45 + assert len(singles_blob) == 4 + 100 * (24 + 1 + 8) + + +# --------------------------------------------------------------------------- +# Fragment re-use across two objects is implicit (no on-disk cross-ref) +# --------------------------------------------------------------------------- + + +def test_two_objects_can_reference_same_chunk_local_fragment() -> None: + """Re-use is implicit: two manifests both pointing at fragment 5 in + chunk (1,2,3) are valid and decode independently.""" + obj_a = encode_object_manifest_blocks([((1, 2, 3), 5)], sid_ndim=3) + obj_b = encode_object_manifest_blocks([((1, 2, 3), 5)], sid_ndim=3) + blocks_a = decode_object_manifest_blocks(obj_a, sid_ndim=3) + blocks_b = decode_object_manifest_blocks(obj_b, sid_ndim=3) + assert blocks_a == blocks_b == [((1, 2, 3), 5)] + + +# --------------------------------------------------------------------------- +# Error paths +# --------------------------------------------------------------------------- + + +def test_wrong_chunk_coord_rank_rejected() -> None: + with pytest.raises(ArrayError, match="sid_ndim=3"): + encode_object_manifest_blocks([((1, 2), 0)], sid_ndim=3) + + +def test_negative_single_fragment_index_rejected() -> None: + with pytest.raises(ArrayError, match=">= 0"): + encode_object_manifest_blocks([((0, 0, 0), -1)], sid_ndim=3) + + +def test_negative_range_count_rejected() -> None: + with pytest.raises(ArrayError, match=">= 0"): + encode_object_manifest_blocks([((0, 0, 0), (5, -1))], sid_ndim=3) + + +def test_negative_explicit_index_rejected() -> None: + with pytest.raises(ArrayError, match="non-negative"): + encode_object_manifest_blocks( + [((0, 0, 0), np.array([0, -1, 2]))], sid_ndim=3, + ) + + +def test_truncated_manifest_rejected() -> None: + raw = encode_object_manifest_blocks( + [((1, 2, 3), 7), ((4, 5, 6), (0, 3))], sid_ndim=3, + ) + with pytest.raises(ArrayError, match="truncated"): + decode_object_manifest_blocks(raw[:-4], sid_ndim=3) + + +def 
test_trailing_bytes_rejected() -> None: + raw = encode_object_manifest_blocks([((1, 2, 3), 7)], sid_ndim=3) + with pytest.raises(ArrayError, match="trailing bytes"): + decode_object_manifest_blocks(raw + b"\x00\x00", sid_ndim=3) + + +def test_too_short_blob_rejected() -> None: + with pytest.raises(ArrayError, match="too short"): + decode_object_manifest_blocks(b"\x00\x00", sid_ndim=3) + + +def test_mode_constants_match_spec() -> None: + """Lock down the wire format mode tags.""" + assert MANIFEST_MODE_SINGLE == 0 + assert MANIFEST_MODE_RANGE == 1 + assert MANIFEST_MODE_EXPLICIT == 2 diff --git a/tests/test_per_object_pyramid.py b/tests/test_per_object_pyramid.py index ea4aa2b..ea83401 100644 --- a/tests/test_per_object_pyramid.py +++ b/tests/test_per_object_pyramid.py @@ -20,7 +20,7 @@ from zarr_vectors.constants import ( CAP_PRESERVED_OBJECT_IDS, - CAP_SHARED_VERTEX_GROUPS, + CAP_SHARED_FRAGMENTS, COARSEN_PER_OBJECT, ) from zarr_vectors.core.arrays import ( @@ -78,14 +78,14 @@ def test_per_object_level_metadata(tmp_path): root = open_store(str(store)) lm = read_level_metadata(root, 1) assert lm.preserves_object_ids is True - assert lm.shared_vertex_groups is True + assert lm.shared_fragments is True assert lm.coarsening_method == COARSEN_PER_OBJECT assert lm.inherited_num_objects == 10 assert lm.parent_level == 0 rm = read_root_metadata(root) assert CAP_PRESERVED_OBJECT_IDS in rm.format_capabilities - assert CAP_SHARED_VERTEX_GROUPS in rm.format_capabilities + assert CAP_SHARED_FRAGMENTS in rm.format_capabilities # =================================================================== diff --git a/tests/test_scaffolding.py b/tests/test_scaffolding.py index 59e1f03..a2a05f2 100644 --- a/tests/test_scaffolding.py +++ b/tests/test_scaffolding.py @@ -48,7 +48,7 @@ def test_import_constants(self) -> None: OBJIDX_STANDARD, OBJIDX_IDENTITY, VALID_GEOMETRY_TYPES, ) - assert FORMAT_VERSION.startswith("0.5") + assert FORMAT_VERSION.startswith("0.6") assert 
len(VALID_GEOMETRY_TYPES) == 7 diff --git a/zarr_vectors/constants.py b/zarr_vectors/constants.py index e023b31..2cd9487 100644 --- a/zarr_vectors/constants.py +++ b/zarr_vectors/constants.py @@ -9,9 +9,21 @@ # Format version # --------------------------------------------------------------------------- -FORMAT_VERSION: str = "0.5.0" +FORMAT_VERSION: str = "0.6.0" """Current ZV specification version. +0.6.0: fragment-index schema. Replaces ``vertex_group_offsets`` with +``vertex_fragments`` and splits the v0.5 inline-header link blob into +a flat ``links/0/`` payload plus a sibling +``link_fragments/`` group. The fragment-index byte layout +expresses each per-group boundary as either a contiguous index range +``[start, start + count)`` or an explicit list of row indices, supporting +vertex re-use across fragments. ``object_index/data`` now uses a +per-chunk manifest-block format with single / range / explicit modes; +cross-level link arrays (``cross_chunk_links``, ``delta != 0``) are +unchanged. Hard break: 0.5.x stores are not readable; rewrite from +source. + 0.5.0: NGFF-alignment cleanup + format simplification. 
The 0.5 series went through several on-disk simplifications without a version bump (consumers should pin to a specific point release): @@ -52,10 +64,16 @@ Dropped objects appear as empty manifest slots and zero ``present_mask`` bytes; ``parent_level`` carries semantic weight.""" -CAP_SHARED_VERTEX_GROUPS: str = "shared_vertex_groups" -"""At least one resolution level stores per-chunk vertex groups that -may be referenced by multiple objects' manifests (shared metavertices -in the per-object pyramid regime).""" +CAP_SHARED_FRAGMENTS: str = "shared_fragments" +"""At least one resolution level stores per-chunk fragments that may +be referenced by multiple objects' manifests (the v0.6 successor to +``shared_vertex_groups``; the sharing primitive is now a fragment +rather than a contiguous-byte vertex group).""" + +CAP_FRAGMENT_INDEX: str = "fragment_index" +"""The store uses the v0.6 fragment-index encoding for ``vertex_fragments`` +and ``link_fragments`` (single uint8 blob per chunk; see +:mod:`zarr_vectors.encoding.fragments`).""" CAP_MULTISCALE_LINKS: str = "multiscale_links" """Store uses the 0.4 multiscale links layout (``links//``, @@ -102,7 +120,16 @@ # --------------------------------------------------------------------------- VERTICES: str = "vertices" -VERTEX_GROUP_OFFSETS: str = "vertex_group_offsets" +VERTEX_FRAGMENTS: str = "vertex_fragments" +"""Per-chunk fragment-index group describing how rows of ``vertices/`` +group into fragments. See :mod:`zarr_vectors.encoding.fragments`.""" + +LINK_FRAGMENTS: str = "link_fragments" +"""Per-chunk fragment-index group describing how rows of +``links/0/`` group into fragments. 
Exists at ``delta == 0`` +only; cross-level link arrays keep their inline self-describing +header.""" + LINKS: str = "links" VERTEX_ATTRIBUTES: str = "vertex_attributes" OBJECT_INDEX: str = "object_index" @@ -121,7 +148,8 @@ ALL_ARRAY_NAMES: frozenset[str] = frozenset({ VERTICES, - VERTEX_GROUP_OFFSETS, + VERTEX_FRAGMENTS, + LINK_FRAGMENTS, LINKS, VERTEX_ATTRIBUTES, OBJECT_INDEX, diff --git a/zarr_vectors/core/arrays.py b/zarr_vectors/core/arrays.py index ffecf28..a0a5389 100644 --- a/zarr_vectors/core/arrays.py +++ b/zarr_vectors/core/arrays.py @@ -23,11 +23,12 @@ GROUP_ATTRIBUTES, GROUPS, LINK_ATTRIBUTES, + LINK_FRAGMENTS, LINKS, OBJECT_ATTRIBUTES, OBJECT_INDEX, VERTEX_ATTRIBUTES, - VERTEX_GROUP_OFFSETS, + VERTEX_FRAGMENTS, VERTICES, ) from zarr_vectors.core.paths import ( @@ -39,17 +40,20 @@ parse_delta, ) from zarr_vectors.core.store import FsGroup +from zarr_vectors.encoding.fragments import ( + FragmentIndex, + decode_fragments, + decode_object_manifest_blocks, + encode_fragments, + encode_object_manifest_blocks, +) from zarr_vectors.encoding.ragged import ( - decode_object_index, decode_ragged_blob, decode_ragged_ints, decode_vertex_groups, - decode_vertex_offsets, - encode_object_index, encode_ragged_blob, encode_ragged_ints, encode_vertex_groups, - encode_vertex_offsets, ) from zarr_vectors.exceptions import ArrayError from zarr_vectors.typing import ( @@ -122,12 +126,16 @@ def create_vertices_array( encoding: ``"raw"`` or ``"draco"``. """ _ensure_array_dir(level_group, VERTICES) - _ensure_array_dir(level_group, VERTEX_GROUP_OFFSETS) + _ensure_array_dir(level_group, VERTEX_FRAGMENTS) level_group.write_array_meta(VERTICES, { "zv_array": "vertices", "dtype": dtype, "encoding": encoding, }) + level_group.write_array_meta(VERTEX_FRAGMENTS, { + "zv_array": VERTEX_FRAGMENTS, + "encoding": "fragment_index_v1", + }) def create_links_array( @@ -326,8 +334,8 @@ def write_chunk_vertices( """Write vertex groups to a spatial chunk. 
Encodes the groups as a contiguous byte buffer in ``vertices/`` and - writes the ``(K,)`` int64 vertex byte offsets to - ``vertex_group_offsets/``. + writes a v0.6 fragment-index to ``vertex_fragments/`` describing each + group as a contiguous range of vertex rows in source order. Args: level_group: Resolution level group. @@ -336,17 +344,30 @@ def write_chunk_vertices( dtype: Numpy dtype for serialisation. Returns: - ``(K,)`` int64 array of vertex byte offsets. + ``(K,)`` int64 array of vertex byte offsets (kept for backwards- + compatible signature; callers that need the v0.6 fragment-index + should use :func:`read_vertex_fragment_index`). """ dtype = np.dtype(dtype) key = _chunk_key(chunk_coords) - raw_bytes, vertex_offsets = encode_vertex_groups(groups, dtype) + raw_bytes, vertex_byte_offsets = encode_vertex_groups(groups, dtype) level_group.write_bytes(VERTICES, key, raw_bytes) + + # Express each group as a contiguous (start_row, count) fragment. + if len(groups) == 0: + fragments: list[tuple[int, int]] = [] + else: + per_group_counts = [int(np.asarray(g).shape[0]) for g in groups] + cumulative = 0 + fragments = [] + for n in per_group_counts: + fragments.append((cumulative, n)) + cumulative += n level_group.write_bytes( - VERTEX_GROUP_OFFSETS, key, encode_vertex_offsets(vertex_offsets), + VERTEX_FRAGMENTS, key, encode_fragments(fragments), ) - return vertex_offsets + return vertex_byte_offsets def write_chunk_links( @@ -382,24 +403,59 @@ def write_chunk_links( key = _chunk_key(chunk_coords) full_name = links_path(delta) - if delta == 0 and level_group.chunk_exists(VERTEX_GROUP_OFFSETS, key): - existing = level_group.read_bytes(VERTEX_GROUP_OFFSETS, key) - vertex_offsets = decode_vertex_offsets(existing) - if len(vertex_offsets) != len(link_groups): + if delta == 0 and level_group.chunk_exists(VERTEX_FRAGMENTS, key): + existing_fi = decode_fragments( + level_group.read_bytes(VERTEX_FRAGMENTS, key), + ) + if existing_fi.num_fragments != len(link_groups): raise 
ArrayError( f"Link group count ({len(link_groups)}) != " - f"vertex group count ({len(vertex_offsets)}) in chunk {key}" + f"vertex fragment count ({existing_fi.num_fragments}) in chunk {key}" ) - # Self-describing blob: per-group byte offsets are packed in an - # inline header followed by the concatenated link data. + if delta == 0: + # v0.6 intra-level: flat concatenated link data + sibling + # link_fragments/ describing per-group row ranges. + data_bytes, link_byte_offsets = encode_ragged_ints(link_groups, dtype) + level_group.write_bytes(full_name, key, data_bytes) + + link_row_size = dtype.itemsize * ( + int(np.asarray(link_groups[0]).shape[1]) if ( + link_groups and np.asarray(link_groups[0]).ndim == 2 + ) else 1 + ) + # Fragment per group as a contiguous range of link rows. + if len(link_groups) == 0: + link_fragments: list[tuple[int, int]] = [] + else: + cumulative = 0 + link_fragments = [] + for g in link_groups: + n = int(np.asarray(g).shape[0]) if np.asarray(g).ndim >= 1 else 0 + link_fragments.append((cumulative, n)) + cumulative += n + # Ensure the sibling array group exists. + if not level_group.chunk_exists(LINK_FRAGMENTS, key): + level_group.require_group(LINK_FRAGMENTS) + try: + level_group.read_array_meta(LINK_FRAGMENTS) + except Exception: + level_group.write_array_meta(LINK_FRAGMENTS, { + "zv_array": LINK_FRAGMENTS, + "encoding": "fragment_index_v1", + }) + level_group.write_bytes( + LINK_FRAGMENTS, key, encode_fragments(link_fragments), + ) + del link_row_size # silence unused-variable warning + return link_byte_offsets + + # delta != 0: cross-level links keep the v0.5 inline self-describing + # layout (out of scope for the v0.6 fragment-index refactor). blob = encode_ragged_blob(link_groups, dtype) level_group.write_bytes(full_name, key, blob) - - # Recover the per-group byte offsets (relative to the data section) - # for the return value. 
- _, link_offsets = encode_ragged_ints(link_groups, dtype) - return link_offsets + _, link_byte_offsets = encode_ragged_ints(link_groups, dtype) + return link_byte_offsets def write_chunk_attributes( @@ -492,9 +548,26 @@ def write_object_index( for oid in range(size): manifest_list.append(manifests.get(oid, [])) - raw_bytes, offsets = encode_object_index(manifest_list, sid_ndim) + # v0.6 manifest-block encoding. Each old (chunk, vg_index) tuple + # becomes one mode-0 (single fragment) block. Range / explicit + # short-circuits are reserved for writers that know they produce + # ranges or fragment-list shapes — to be plumbed through the + # higher-level write APIs in a future change. + blob_parts: list[bytes] = [] + offsets_arr = np.empty(len(manifest_list), dtype=np.int64) + cur = 0 + for i, manifest in enumerate(manifest_list): + blocks = [ + (tuple(int(c) for c in chunk_coords), int(vg_index)) + for chunk_coords, vg_index in manifest + ] + blob = encode_object_manifest_blocks(blocks, sid_ndim=sid_ndim) + blob_parts.append(blob) + offsets_arr[i] = cur + cur += len(blob) + raw_bytes = b"".join(blob_parts) level_group.write_bytes(OBJECT_INDEX, "data", raw_bytes) - level_group.write_bytes(OBJECT_INDEX, "offsets", offsets.tobytes()) + level_group.write_bytes(OBJECT_INDEX, "offsets", offsets_arr.tobytes()) level_group.write_array_meta(OBJECT_INDEX, { "zv_array": "object_index", "num_objects": size, @@ -775,7 +848,10 @@ def read_chunk_vertices( except Exception as e: raise ArrayError(f"Cannot read vertices chunk {key}: {e}") from e - offsets = _read_vertex_offsets(level_group, chunk_coords) + bytes_per_vertex = int(dtype.itemsize) * int(ndim) + offsets = _read_vertex_offsets( + level_group, chunk_coords, bytes_per_vertex=bytes_per_vertex, + ) return decode_vertex_groups(raw, offsets, dtype, ndim) @@ -802,7 +878,10 @@ def read_vertex_group( dtype = np.dtype(dtype) raw = level_group.read_bytes(VERTICES, key) - offsets = _read_vertex_offsets(level_group, chunk_coords) + 
bytes_per_vertex = int(dtype.itemsize) * int(ndim) + offsets = _read_vertex_offsets( + level_group, chunk_coords, bytes_per_vertex=bytes_per_vertex, + ) if vg_index < 0 or vg_index >= len(offsets): raise ArrayError( @@ -856,6 +935,30 @@ def read_chunk_links( f"Cannot read links chunk {key} (delta={format_delta(delta)}): {e}" ) from e + if delta == 0: + # v0.6 intra-level layout: raw is the flat concatenated link + # data; per-group row counts live in link_fragments/. + fi = read_link_fragment_index(level_group, chunk_coords) + groups: list[npt.NDArray[np.integer]] = [] + row_size = int(dtype.itemsize) * int(link_width) + for f in range(fi.num_fragments): + if not fi.is_range(f): + raise ArrayError( + f"link_fragments/{key} fragment {f} is non-contiguous; " + "read_chunk_links requires every fragment to be a " + "contiguous range over rows of links/0/.", + ) + start, count = fi.range(f) + byte_lo = int(start) * row_size + byte_hi = byte_lo + int(count) * row_size + segment = raw[byte_lo:byte_hi] + arr = np.frombuffer(segment, dtype=dtype) + if link_width > 1: + arr = arr.reshape(-1, link_width) + groups.append(arr) + return groups + + # Cross-level delta != 0: v0.5 inline self-describing layout. return decode_ragged_blob(raw, dtype, ncols=link_width) @@ -962,25 +1065,31 @@ def _derive_attribute_offsets( vertex data, which corresponds to ``n_k`` vertices (and therefore ``n_k`` attribute rows). """ - vert_offsets = _read_vertex_offsets(level_group, chunk_coords) - if len(vert_offsets) == 0: - return np.empty(0, dtype=np.int64) vert_row_size = vert_dtype.itemsize * vert_ndim if vert_row_size <= 0: return np.empty(0, dtype=np.int64) - # Vertex byte size per group → vertex count per group. 
- key = _chunk_key(chunk_coords) - vert_total = len(level_group.read_bytes(VERTICES, key)) - ends = np.empty_like(vert_offsets) - if len(vert_offsets) > 1: - ends[:-1] = vert_offsets[1:] - ends[-1] = vert_total - n_per_group = (ends - vert_offsets) // vert_row_size + fi = read_vertex_fragment_index(level_group, chunk_coords) + if fi.num_fragments == 0: + return np.empty(0, dtype=np.int64) + # Per-fragment vertex row count. Today's writers always emit + # range fragments; non-contiguous shapes will need a richer + # attribute-alignment story (out of scope for this change). + n_per_group = np.empty(fi.num_fragments, dtype=np.int64) + for f in range(fi.num_fragments): + if not fi.is_range(f): + raise ArrayError( + f"vertex_fragments fragment {f} is non-contiguous; " + "attribute alignment requires every fragment to be a " + "contiguous range of vertex rows.", + ) + _start, count = fi.range(f) + n_per_group[f] = int(count) attr_row_size = attr_dtype.itemsize * attr_ncols - attr_byte_lengths = n_per_group.astype(np.int64) * int(attr_row_size) + attr_byte_lengths = n_per_group * int(attr_row_size) attr_offsets = np.empty_like(attr_byte_lengths) attr_offsets[0] = 0 np.cumsum(attr_byte_lengths[:-1], out=attr_offsets[1:]) + del total_attr_bytes # signature retained for caller compat return attr_offsets @@ -1010,8 +1119,7 @@ def read_object_manifest( level_group.read_bytes(OBJECT_INDEX, "offsets"), dtype=np.int64, ) - all_manifests = decode_object_index(raw, offsets, sid_ndim) - return all_manifests[object_id] + return _decode_one_manifest(raw, offsets, sid_ndim, object_id, num_objects) def read_all_object_manifests( @@ -1024,13 +1132,57 @@ def read_all_object_manifests( """ meta = level_group.read_array_meta(OBJECT_INDEX) sid_ndim = meta["sid_ndim"] + num_objects = int(meta.get("num_objects", 0)) raw = level_group.read_bytes(OBJECT_INDEX, "data") offsets = np.frombuffer( level_group.read_bytes(OBJECT_INDEX, "offsets"), dtype=np.int64, ) - return 
list(decode_object_index(raw, offsets, sid_ndim)) + return [ + _decode_one_manifest(raw, offsets, sid_ndim, i, num_objects) + for i in range(num_objects) + ] + + +def _decode_one_manifest( + data: bytes, + offsets: npt.NDArray[np.int64], + sid_ndim: int, + object_id: int, + num_objects: int, +) -> ObjectManifest: + """Slice + decode one object's manifest from the v0.6 block-encoded + ``object_index/data`` blob and expand to the legacy + ``[(chunk_coords, vg_index), ...]`` tuple list. + + Mode-1 (range) and mode-2 (explicit list) blocks expand to one + tuple per fragment so existing call sites that iterate + ``(chunk_coords, vg_index)`` keep working unchanged. Callers that + want the raw block representation can use + :func:`zarr_vectors.encoding.fragments.decode_object_manifest_blocks` + directly. + """ + start = int(offsets[object_id]) + end = ( + int(offsets[object_id + 1]) + if object_id + 1 < num_objects + else len(data) + ) + blocks = decode_object_manifest_blocks(data[start:end], sid_ndim=sid_ndim) + out: ObjectManifest = [] + for chunk_coords, frag_ref in blocks: + if isinstance(frag_ref, int): + out.append((chunk_coords, int(frag_ref))) + elif isinstance(frag_ref, tuple): + r_start, r_count = frag_ref + for k in range(int(r_count)): + out.append((chunk_coords, int(r_start) + k)) + else: + # np.ndarray of explicit indices + for idx in frag_ref: + out.append((chunk_coords, int(idx))) + return out def read_object_vertices( @@ -1374,22 +1526,81 @@ def count_vertex_groups( level_group: FsGroup, chunk_coords: ChunkCoords, ) -> int: - """Count vertex groups in a chunk (from offsets array).""" - offsets = _read_vertex_offsets(level_group, chunk_coords) - return len(offsets) + """Count vertex groups in a chunk by reading the fragment-index header.""" + return len(read_vertex_fragment_index(level_group, chunk_coords)) # =================================================================== # Internal helpers # 
=================================================================== +def read_vertex_fragment_index( + level_group: FsGroup, + chunk_coords: ChunkCoords, +) -> FragmentIndex: + """Read and decode the ``vertex_fragments/`` blob. + + Returns the v0.6 :class:`FragmentIndex` view describing how rows of + ``vertices/`` partition into fragments. + """ + key = _chunk_key(chunk_coords) + raw = level_group.read_bytes(VERTEX_FRAGMENTS, key) + return decode_fragments(raw) + + +def read_link_fragment_index( + level_group: FsGroup, + chunk_coords: ChunkCoords, +) -> FragmentIndex: + """Read and decode the ``link_fragments/`` blob (delta=0 only).""" + key = _chunk_key(chunk_coords) + raw = level_group.read_bytes(LINK_FRAGMENTS, key) + return decode_fragments(raw) + + def _read_vertex_offsets( level_group: FsGroup, chunk_coords: ChunkCoords, + *, + bytes_per_vertex: int | None = None, ) -> npt.NDArray[np.int64]: - """Read the ``(K,)`` int64 vertex byte offsets for a chunk.""" - key = _chunk_key(chunk_coords) - raw = level_group.read_bytes(VERTEX_GROUP_OFFSETS, key) - return decode_vertex_offsets(raw) + """Read the ``(K,)`` int64 vertex byte offsets for a chunk. + + Computed from the v0.6 ``vertex_fragments/`` index. Every + fragment must be a contiguous range over rows of ``vertices/`` + — the only shape the existing writer produces. Stores written by + future writers that materialise non-contiguous / shared-row + fragments must use the higher-level fragment-index API directly; + this helper raises rather than silently lying about a byte offset + that doesn't exist. + + Args: + level_group: Resolution level group. + chunk_coords: Spatial chunk coordinates. + bytes_per_vertex: Bytes per vertex row. When omitted it is + inferred from the ``vertices/`` array's ``dtype`` metadata + and the root NGFF axes count. 
+ """ + if bytes_per_vertex is None: + vmeta = level_group.read_array_meta(VERTICES) + vdtype = np.dtype(vmeta.get("dtype", "float32")) + ndim = _infer_vert_ndim(level_group) + bytes_per_vertex = int(vdtype.itemsize) * int(ndim) + fi = read_vertex_fragment_index(level_group, chunk_coords) + if fi.num_fragments == 0: + return np.empty(0, dtype=np.int64) + offsets = np.empty(fi.num_fragments, dtype=np.int64) + for i in range(fi.num_fragments): + if not fi.is_range(i): + raise ArrayError( + f"vertex_fragments/{_chunk_key(chunk_coords)} fragment {i} " + "is non-contiguous; byte-offset access requires every " + "fragment to be a contiguous range over rows of " + "vertices/. Use read_vertex_fragment_index() " + "directly for non-contiguous fragments.", + ) + start, _count = fi.range(i) + offsets[i] = int(start) * int(bytes_per_vertex) + return offsets diff --git a/zarr_vectors/core/group.py b/zarr_vectors/core/group.py index ab8d31e..e73abec 100644 --- a/zarr_vectors/core/group.py +++ b/zarr_vectors/core/group.py @@ -181,7 +181,7 @@ def batched_reads( chunk_keys = list_chunk_keys(level_group, VERTICES) with level_group.batched_reads([ (VERTICES, chunk_keys), - (VERTEX_GROUP_OFFSETS, chunk_keys), + (VERTEX_FRAGMENTS, chunk_keys), *((f"{VERTEX_ATTRIBUTES}/{a}", chunk_keys) for a in attrs), ]): for cc in chunk_keys: diff --git a/zarr_vectors/core/metadata.py b/zarr_vectors/core/metadata.py index 428ba87..f4bc6f3 100644 --- a/zarr_vectors/core/metadata.py +++ b/zarr_vectors/core/metadata.py @@ -301,7 +301,11 @@ def validate(self) -> None: # were duplicated under # ``zarr_vectors.spatial_index_dims``; per-array # ``.zattrs`` carried a redundant ``dtype`` field. - # No shims ship; older stores must be re-written. + # - pre-0.6.0 : ``vertex_group_offsets`` instead of + # ``vertex_fragments``; links carried an inline + # self-describing header; ``object_index`` used + # the flat quad encoding. + # No shims ship; older stores must be rewritten from source. 
parts = self.zv_version.split(".") try: major = int(parts[0]) @@ -311,14 +315,12 @@ def validate(self) -> None: raise MetadataError( f"zv_version {self.zv_version!r} is not a valid X.Y[.Z] string" ) from exc - if (major, minor, patch) < (0, 5, 0): + if (major, minor, patch) < (0, 6, 0): raise MetadataError( f"store zv_version is {self.zv_version}; this build " f"requires {FORMAT_VERSION} — no backwards-compat shim. " - f"Pre-0.5 stores used the legacy ``format_version`` key " - f"and duplicated axes under " - f"``zarr_vectors.spatial_index_dims``. Re-write the " - f"store with a {FORMAT_VERSION} writer." + f"0.5.x stores used ``vertex_group_offsets`` (byte offsets) " + f"and an inline link-blob header; rewrite from source." ) if self.cross_level_storage not in VALID_XLEVEL_STORAGE: @@ -580,7 +582,7 @@ class LevelMetadata: """OID-space size inherited from the parent level (= ``parent_level.num_objects``). Lets readers allocate lookup arrays without traversing parent metadata.""" - shared_vertex_groups: bool = False + shared_fragments: bool = False """True when per-chunk vertex groups represent metavertices that may be referenced by multiple objects' manifests (the shared- metavertex case).""" @@ -610,8 +612,8 @@ def to_dict(self) -> dict[str, Any]: d["preserves_object_ids"] = True if self.inherited_num_objects is not None: d["inherited_num_objects"] = int(self.inherited_num_objects) - if self.shared_vertex_groups: - d["shared_vertex_groups"] = True + if self.shared_fragments: + d["shared_fragments"] = True return {"zarr_vectors_level": d} @classmethod @@ -654,7 +656,7 @@ def from_dict(cls, d: dict[str, Any]) -> LevelMetadata: chunk_attribute_values=list(cav) if cav is not None else None, preserves_object_ids=bool(lv.get("preserves_object_ids", False)), inherited_num_objects=lv.get("inherited_num_objects"), - shared_vertex_groups=bool(lv.get("shared_vertex_groups", False)), + shared_fragments=bool(lv.get("shared_fragments", False)), ) def validate(self) -> None: diff 
--git a/zarr_vectors/encoding/compression.py b/zarr_vectors/encoding/compression.py index 0a1f8be..5b36563 100644 --- a/zarr_vectors/encoding/compression.py +++ b/zarr_vectors/encoding/compression.py @@ -13,11 +13,12 @@ GROUP_ATTRIBUTES, GROUPS, LINK_ATTRIBUTES, + LINK_FRAGMENTS, LINKS, OBJECT_ATTRIBUTES, OBJECT_INDEX, VERTEX_ATTRIBUTES, - VERTEX_GROUP_OFFSETS, + VERTEX_FRAGMENTS, VERTICES, ) @@ -42,8 +43,10 @@ def get_default_compressor(array_type: str) -> dict[str, object]: "shuffle": 2, # SHUFFLE_BITSHUFFLE — good for correlated ints } - # Offsets are monotonically increasing integers — delta + compress - if array_type in (VERTEX_GROUP_OFFSETS, OBJECT_INDEX): + # Fragment-index blobs (v0.6) and the object_index blob mix int64 + # range tables with uint32 CSR offsets — byte-shuffle decorrelates + # the high zero bytes well across the heterogeneous payload. + if array_type in (VERTEX_FRAGMENTS, LINK_FRAGMENTS, OBJECT_INDEX): return { "id": "blosc", "cname": "zstd", diff --git a/zarr_vectors/encoding/fragments.py b/zarr_vectors/encoding/fragments.py new file mode 100644 index 0000000..75fba7d --- /dev/null +++ b/zarr_vectors/encoding/fragments.py @@ -0,0 +1,682 @@ +"""Fragment-index encoding and decoding for per-chunk ragged structures. + +A *fragment index* describes the F fragments inside one spatial chunk's +``vertex_fragments/`` or ``link_fragments/`` blob. Each +fragment is either: + +* a contiguous **range** ``[start, start+count)`` of row indices into the + chunk's ``vertices/`` (or ``links/0/``) array, or +* an explicit **list** of row indices, allowing two fragments in the same + chunk to re-use the same underlying vertex/link. + +The on-disk byte layout is fixed at version 1: + +.. 
code-block:: text + + HEADER (16 bytes, 8-byte-aligned) + uint32 magic = 0x5A56_4647 ('ZVFG') + uint16 version = 1 + uint16 flags = 0 (reserved) + uint32 num_fragments F + uint32 num_range_fragments R (popcount of bitmap; redundant) + + RANGE BITMAP + ceil(F/8) bytes, padded to next 8-byte boundary + bit f (LSB-first within byte f//8) = 1 iff fragment f is a range + + RANGE TABLE (R entries × 16 bytes) + int64 start, int64 count per range fragment, fragment-index order + + EXPLICIT CSR (E = F − R entries) + uint32 explicit_offsets[E+1] running offsets into explicit_indices + int64 explicit_indices[T] concatenated indices, T = explicit_offsets[E] + +The layout is designed so that :meth:`FragmentIndex.is_range` is a single +bit lookup — no fragment payload has to be decoded to answer "is this +fragment a contiguous range?". Random access to a fragment's representation +goes through a one-time prefix-popcount cache built lazily on first call. +""" + +from __future__ import annotations + +import struct +from dataclasses import dataclass, field +from typing import Sequence + +import numpy as np +import numpy.typing as npt + +from zarr_vectors.exceptions import ArrayError + +# Public constants --------------------------------------------------- + +#: Magic number written at the start of every fragment-index blob. +FRAGMENT_INDEX_MAGIC: int = 0x5A56_4647 # 'ZVFG' + +#: On-disk format version this module reads and writes. +FRAGMENT_INDEX_VERSION: int = 1 + +#: Header length in bytes (kept 8-byte-aligned for downstream int64 fields). +_HEADER_BYTES: int = 16 + +#: Header struct: magic, version, flags, num_fragments, num_range_fragments. 
+_HEADER_STRUCT = struct.Struct("<IHHII") + + +def _bitmap_padded_length(num_fragments: int) -> int: + """Number of bytes the range bitmap occupies on disk, including padding + to the next 8-byte boundary so the subsequent int64 range table is + naturally aligned.""" + raw = (num_fragments + 7) // 8 + return (raw + 7) & ~7 + + +# Encoding ----------------------------------------------------------- + + +Fragment = npt.NDArray[np.integer] | tuple[int, int] + + +def _classify_fragment( + fragment: Fragment, *, force_explicit: bool, +) -> tuple[bool, tuple[int, int] | np.ndarray]: + """Return ``(is_range, payload)`` for one input fragment. + + ``payload`` is either ``(start, count)`` for the range path or an + ``np.ndarray[int64]`` for the explicit path. Auto-detection looks + for any non-negative array where ``arr == arange(arr[0], arr[0]+len(arr))``; + pass ``force_explicit=True`` to bypass the check. + """ + if isinstance(fragment, tuple): + if len(fragment) != 2: + raise ArrayError( + f"Fragment tuple must have shape (start, count), got {fragment!r}", + ) + start, count = int(fragment[0]), int(fragment[1]) + if count < 0: + raise ArrayError(f"Fragment count must be >= 0, got {count}") + return True, (start, count) + + arr = np.asarray(fragment) + if arr.ndim != 1: + raise ArrayError( + f"Fragment array must be 1-D, got shape {arr.shape}", + ) + arr = arr.astype(np.int64, copy=False) + if not force_explicit and arr.size > 0: + start = int(arr[0]) + # Cheap arange-detect: compare to arange. For small fragments + # this is bounded; for very long fragments we still want the + # range path because it's by far the more compressible + # representation.
+ if start >= 0 and np.array_equal( + arr, np.arange(start, start + arr.size, dtype=np.int64), + ): + return True, (start, int(arr.size)) + if arr.size == 0: + # Zero-length explicit slice — preserve the explicit path so the + # caller can distinguish an empty list from an empty range if + # they care; but for the on-disk encoding both shapes have the + # same cost (1 csr_offset slot, 0 indices), so this is mostly a + # round-tripping detail. + return False, arr + if int(arr.min()) < 0: + raise ArrayError( + "Explicit fragment indices must be non-negative", + ) + return False, arr + + +def encode_fragments( + fragments: Sequence[Fragment], + *, + force_explicit: bool = False, +) -> bytes: + """Encode F fragments into the v1 byte layout. + + Args: + fragments: One entry per fragment. Each entry is either a + ``(start, count)`` tuple (always emitted as a range), or a + 1-D integer array (auto-detected as a range when it equals + ``arange(arr[0], arr[0]+len(arr))``, else explicit). + force_explicit: If True, never auto-promote arrays to the range + path. Tuples are still emitted as ranges — pass an array if + you need the explicit path unconditionally. + + Returns: + The fragment-index blob, suitable for writing as the single + chunk of a 1-D uint8 zarr v3 array. + """ + f = len(fragments) + if f == 0: + # Header-only sentinel for empty chunks. 
+ return _HEADER_STRUCT.pack( + FRAGMENT_INDEX_MAGIC, + FRAGMENT_INDEX_VERSION, + 0, # flags + 0, # num_fragments + 0, # num_range_fragments + ) + + classified: list[tuple[bool, tuple[int, int] | np.ndarray]] = [ + _classify_fragment(frag, force_explicit=force_explicit) + for frag in fragments + ] + + # Bitmap (range bits) ------------------------------------------------ + bitmap_len = _bitmap_padded_length(f) + bitmap = bytearray(bitmap_len) + r = 0 + for i, (is_range, _) in enumerate(classified): + if is_range: + bitmap[i >> 3] |= 1 << (i & 7) + r += 1 + e = f - r + + # Range table -------------------------------------------------------- + range_table = np.empty((r, 2), dtype=np.int64) + r_idx = 0 + for is_range, payload in classified: + if is_range: + start, count = payload # type: ignore[misc] + range_table[r_idx, 0] = start + range_table[r_idx, 1] = count + r_idx += 1 + + # Explicit CSR ------------------------------------------------------- + explicit_indices_list: list[np.ndarray] = [] + explicit_offsets = np.empty(e + 1, dtype=np.uint32) + explicit_offsets[0] = 0 + e_idx = 0 + for is_range, payload in classified: + if not is_range: + arr = payload # type: ignore[assignment] + explicit_indices_list.append(arr) + e_idx += 1 + explicit_offsets[e_idx] = explicit_offsets[e_idx - 1] + arr.size + + if explicit_indices_list: + explicit_indices = np.concatenate( + explicit_indices_list, dtype=np.int64, + ) + else: + explicit_indices = np.empty(0, dtype=np.int64) + + # Pack -------------------------------------------------------------- + header = _HEADER_STRUCT.pack( + FRAGMENT_INDEX_MAGIC, + FRAGMENT_INDEX_VERSION, + 0, # flags + f, + r, + ) + parts: list[bytes] = [ + header, + bytes(bitmap), + range_table.tobytes(), + explicit_offsets.tobytes(), + explicit_indices.tobytes(), + ] + return b"".join(parts) + + +# Decoding ----------------------------------------------------------- + + +@dataclass(frozen=True) +class FragmentIndex: + """Decoded view of one chunk's 
fragment-index blob. + + The dataclass holds zero-copy numpy views onto the source bytes + (where dtype-alignment allows) plus a lazily-built prefix-popcount + cache for random access. ``is_range(f)`` is always a single bit + lookup — no fragment payload is materialised until ``range(f)``, + ``indices(f)`` or ``indices_view(f)`` is called. + """ + + num_fragments: int + _bitmap: np.ndarray # uint8, length ceil(F/8) (unpadded view) + _range_table: np.ndarray # (R, 2) int64 + _csr_offsets: np.ndarray # (E+1,) uint32 + _csr_indices: np.ndarray # (T,) int64 + # Lazy cache: prefix-popcount of the *bitmap* (inclusive at index i+1). + # _popcount_prefix[k] == number of range fragments in [0, k). + # Wrapped in a list so the frozen dataclass can mutate it. + _popcount_cache: list = field(default_factory=list) + + # Public API -------------------------------------------------------- + + def __len__(self) -> int: + return self.num_fragments + + @property + def num_range_fragments(self) -> int: + return int(self._range_table.shape[0]) + + @property + def num_explicit_fragments(self) -> int: + return self.num_fragments - self.num_range_fragments + + def is_range(self, f: int) -> bool: + """Return True if fragment ``f`` is a contiguous range. + + One bit lookup — does not decode any fragment payload. + """ + if f < 0 or f >= self.num_fragments: + raise IndexError( + f"Fragment index {f} out of range [0, {self.num_fragments})", + ) + return bool((self._bitmap[f >> 3] >> (f & 7)) & 1) + + def range(self, f: int) -> tuple[int, int]: + """Return ``(start, count)`` for a range fragment. + + Raises: + ArrayError: If fragment ``f`` is not a range. 
+ """ + if not self.is_range(f): + raise ArrayError( + f"Fragment {f} is explicit, not a range; " + f"use .indices(f) or .indices_view(f)", + ) + row = self._popcount_prefix()[f] + start = int(self._range_table[row, 0]) + count = int(self._range_table[row, 1]) + return start, count + + def indices(self, f: int) -> npt.NDArray[np.int64]: + """Return the indices of fragment ``f`` as a 1-D ``int64`` array. + + For range fragments this materialises ``arange(start, start+count)``. + For explicit fragments this returns a copy of the CSR slice (use + :meth:`indices_view` for a zero-copy view). + """ + if self.is_range(f): + start, count = self.range(f) + return np.arange(start, start + count, dtype=np.int64) + prefix = self._popcount_prefix() + e_idx = f - prefix[f] + a = int(self._csr_offsets[e_idx]) + b = int(self._csr_offsets[e_idx + 1]) + return self._csr_indices[a:b].copy() + + def indices_view(self, f: int) -> npt.NDArray[np.int64]: + """Return a zero-copy view onto the indices of an explicit fragment. + + Raises: + ArrayError: If fragment ``f`` is a range — there is no + backing array to view; call :meth:`indices` instead. + """ + if self.is_range(f): + raise ArrayError( + f"Fragment {f} is a range; .indices_view requires an " + f"explicit fragment. Use .indices(f) to materialise.", + ) + prefix = self._popcount_prefix() + e_idx = f - prefix[f] + a = int(self._csr_offsets[e_idx]) + b = int(self._csr_offsets[e_idx + 1]) + return self._csr_indices[a:b] + + # Internals --------------------------------------------------------- + + def _popcount_prefix(self) -> npt.NDArray[np.int32]: + """Return (and lazily build) the bitmap prefix-popcount. + + ``_popcount_prefix[i]`` == count of set bits in bitmap[0..i) + (i.e. how many fragments in ``[0, i)`` are ranges). Useful for + translating a fragment index ``f`` into a row of ``_range_table`` + (``prefix[f]`` if ``is_range(f)``) or ``_csr_offsets`` + (``f - prefix[f]`` if not). 
+ """ + if self._popcount_cache: + return self._popcount_cache[0] + # One pass over the bitmap, then a cumulative sum. This is + # O(F) once per FragmentIndex; subsequent queries are O(1). + f = self.num_fragments + bits = np.unpackbits( + self._bitmap, bitorder="little", + )[:f].astype(np.int32, copy=False) + prefix = np.empty(f + 1, dtype=np.int32) + prefix[0] = 0 + np.cumsum(bits, out=prefix[1:]) + self._popcount_cache.append(prefix) + return prefix + + +def decode_fragments(raw: bytes) -> FragmentIndex: + """Parse a v1 fragment-index blob into a :class:`FragmentIndex`.""" + if len(raw) < _HEADER_BYTES: + raise ArrayError( + f"Fragment-index blob too short: {len(raw)} < {_HEADER_BYTES}", + ) + magic, version, flags, f, r = _HEADER_STRUCT.unpack_from(raw, 0) + if magic != FRAGMENT_INDEX_MAGIC: + raise ArrayError( + f"Bad fragment-index magic: got 0x{magic:08X}, " + f"want 0x{FRAGMENT_INDEX_MAGIC:08X}", + ) + if version != FRAGMENT_INDEX_VERSION: + raise ArrayError( + f"Unsupported fragment-index version {version}; " + f"this code reads version {FRAGMENT_INDEX_VERSION}", + ) + if flags != 0: + raise ArrayError( + f"Unsupported fragment-index flags 0x{flags:04X}; expected 0", + ) + if r > f: + raise ArrayError( + f"num_range_fragments {r} exceeds num_fragments {f}", + ) + + if f == 0: + return FragmentIndex( + num_fragments=0, + _bitmap=np.empty(0, dtype=np.uint8), + _range_table=np.empty((0, 2), dtype=np.int64), + _csr_offsets=np.zeros(1, dtype=np.uint32), + _csr_indices=np.empty(0, dtype=np.int64), + ) + + offset = _HEADER_BYTES + + bitmap_raw_bytes = (f + 7) // 8 + bitmap_padded = _bitmap_padded_length(f) + if len(raw) < offset + bitmap_padded: + raise ArrayError( + f"Fragment-index blob truncated in bitmap region", + ) + # Copy out the unpadded portion as our canonical bitmap. Copying + # is cheap (≤ ceil(F/8) bytes) and avoids retaining the whole input + # buffer just for a tiny view. 
+ bitmap = np.frombuffer( + raw, dtype=np.uint8, count=bitmap_raw_bytes, offset=offset, + ).copy() + offset += bitmap_padded + + range_table_bytes = r * 16 + if len(raw) < offset + range_table_bytes: + raise ArrayError( + f"Fragment-index blob truncated in range table", + ) + range_table = np.frombuffer( + raw, dtype=np.int64, count=r * 2, offset=offset, + ).reshape(r, 2).copy() + offset += range_table_bytes + + e = f - r + csr_offsets_bytes = (e + 1) * 4 + if len(raw) < offset + csr_offsets_bytes: + raise ArrayError( + f"Fragment-index blob truncated in CSR offsets", + ) + csr_offsets = np.frombuffer( + raw, dtype=np.uint32, count=e + 1, offset=offset, + ).copy() + offset += csr_offsets_bytes + + t = int(csr_offsets[e]) if e > 0 else 0 + csr_indices_bytes = t * 8 + if len(raw) < offset + csr_indices_bytes: + raise ArrayError( + f"Fragment-index blob truncated in CSR indices", + ) + csr_indices = np.frombuffer( + raw, dtype=np.int64, count=t, offset=offset, + ).copy() + + return FragmentIndex( + num_fragments=f, + _bitmap=bitmap, + _range_table=range_table, + _csr_offsets=csr_offsets, + _csr_indices=csr_indices, + ) + + +def read_fragment_index( + level_group, + array_name: str, + chunk_coords: tuple[int, ...], +) -> FragmentIndex: + """Read and decode the fragment-index blob for one chunk. + + ``level_group`` is a :class:`zarr_vectors.core.group.FsGroup` (or + equivalent) exposing :meth:`read_bytes` and + :meth:`chunk_exists`. Returns an empty :class:`FragmentIndex` when + the chunk's blob is absent — callers that need to distinguish + "missing" from "empty fragment list" should check + ``level_group.chunk_exists(array_name, chunk_key)`` first. 
+ """ + chunk_key = ".".join(str(c) for c in chunk_coords) + raw = level_group.read_bytes(array_name, chunk_key) + return decode_fragments(raw) + + +# --------------------------------------------------------------------------- +# Object-index manifest blocks +# --------------------------------------------------------------------------- +# +# Each object's manifest is a sequence of per-chunk *blocks*. A block +# carries one chunk's coordinates plus a fragment reference encoded in +# one of three modes: +# +# mode 0 uint8 single fragment +# int64 fragment_index +# mode 1 uint8 contiguous range +# int64 start, int64 count +# mode 2 uint8 explicit list +# uint32 count +# int64 fragment_indices[count] +# +# The manifest is preceded by: +# +# uint32 num_blocks B +# +# All fragment references are *chunk-local* — they index into the +# ``vertex_fragments/`` array of the block's named chunk +# only, never across chunks. This preserves chunk-write independence: +# chunks can be written without coordinating fragment numbering with +# any other chunk. +# +# An empty manifest is 4 bytes: ``B=0``. 
+ +# Mode tags +MANIFEST_MODE_SINGLE = 0 +MANIFEST_MODE_RANGE = 1 +MANIFEST_MODE_EXPLICIT = 2 + + +ObjectManifestBlock = tuple[ + tuple[int, ...], # chunk_coords + npt.NDArray[np.integer] | tuple[int, int] | int, # fragment ref +] + + +def _encode_one_block( + chunk_coords: tuple[int, ...], + fragment_ref, + sid_ndim: int, + *, + force_explicit: bool, +) -> bytes: + if len(chunk_coords) != sid_ndim: + raise ArrayError( + f"chunk_coords {chunk_coords} has rank {len(chunk_coords)}, " + f"expected sid_ndim={sid_ndim}", + ) + coords_bytes = np.asarray(chunk_coords, dtype=np.int64).tobytes() + + # int → single + if isinstance(fragment_ref, (int, np.integer)): + idx = int(fragment_ref) + if idx < 0: + raise ArrayError( + f"fragment_index must be >= 0, got {idx}", + ) + return ( + coords_bytes + + struct.pack("= 0, got {count}") + return ( + coords_bytes + + struct.pack(" 0: + start = int(arr[0]) + if start >= 0 and np.array_equal( + arr, np.arange(start, start + arr.size, dtype=np.int64), + ): + return ( + coords_bytes + + struct.pack(" 0 and int(arr.min()) < 0: + raise ArrayError( + "Explicit fragment indices must be non-negative", + ) + return ( + coords_bytes + + struct.pack(" bytes: + """Encode one object's manifest into the v0.6 block format. + + Args: + blocks: List of ``(chunk_coords, fragment_ref)`` pairs. Each + block names one spatial chunk and the fragment(s) the + object owns inside that chunk. ``fragment_ref`` may be: + + - an ``int`` (single fragment index, mode 0), + - a ``(start, count)`` tuple (contiguous range, mode 1), or + - a 1-D integer array (auto-detected: arange → range, + otherwise explicit list). + sid_ndim: Rank of the chunk-coordinate space. All + ``chunk_coords`` tuples must be of this length. + force_explicit: If True, never auto-promote arrays to range. + + Returns: + The manifest bytes — a 4-byte block count followed by ``B`` + per-block payloads. An empty list returns 4 bytes (``B=0``). 
+ """ + block_bytes = [ + _encode_one_block(c, f, sid_ndim, force_explicit=force_explicit) + for c, f in blocks + ] + return struct.pack(" list[ + tuple[ + tuple[int, ...], + int | tuple[int, int] | npt.NDArray[np.int64], + ] +]: + """Decode a v0.6 manifest blob into ``(chunk_coords, fragment_ref)`` + pairs. + + Each returned ``fragment_ref`` is one of: + + - ``int`` for mode-0 blocks, + - ``(start, count)`` tuple for mode-1 blocks, + - ``np.ndarray[int64]`` for mode-2 blocks. + + Callers that prefer a uniform shape (e.g. always an array of indices) + should map over the result. + """ + if len(raw) < 4: + raise ArrayError( + f"Manifest blob too short: {len(raw)} < 4", + ) + (b,) = struct.unpack_from(" tuple[bytes, npt.NDArray[np.int64]]: - """Encode a list of object manifests into a byte buffer. - - Each manifest is a list of ``(chunk_coords, vertex_group_index)`` tuples. - These are flattened to ``(sid_ndim + 1)`` ints per entry and concatenated. - - Args: - manifests: ``[manifest_0, manifest_1, ...]`` where each manifest is - ``[(chunk_coords, vg_index), ...]``. - sid_ndim: Number of spatial index dimensions (e.g. 3 for XYZ). - - Returns: - raw_bytes: Concatenated byte buffer of all manifests. - offsets: ``(O,)`` int64 byte offset array, one per object. 
- """ - entry_len = sid_ndim + 1 - dtype = np.dtype(np.int64) - parts: list[bytes] = [] - offsets: list[int] = [] - current_offset = 0 - - for manifest in manifests: - offsets.append(current_offset) - if not manifest: - # Empty manifest — object has no vertex groups - continue - flat: list[int] = [] - for chunk_coords, vg_index in manifest: - if len(chunk_coords) != sid_ndim: - raise ArrayError( - f"Chunk coords length {len(chunk_coords)} != sid_ndim {sid_ndim}" - ) - flat.extend(chunk_coords) - flat.append(vg_index) - arr = np.array(flat, dtype=dtype) - raw = arr.tobytes() - parts.append(raw) - current_offset += len(raw) - - return b"".join(parts), np.array(offsets, dtype=np.int64) - - -def decode_object_index( - raw_bytes: bytes, - offsets: npt.NDArray[np.int64], - sid_ndim: int, -) -> list[list[tuple[tuple[int, ...], int]]]: - """Decode a byte buffer back into a list of object manifests. - - Args: - raw_bytes: Buffer from :func:`encode_object_index`. - offsets: ``(O,)`` byte offset array. - sid_ndim: Number of spatial index dimensions. - - Returns: - List of manifests, each a list of ``(chunk_coords, vg_index)`` tuples. 
- """ - if len(offsets) == 0: - return [] - - entry_len = sid_ndim + 1 - dtype = np.dtype(np.int64) - itemsize = dtype.itemsize - total_len = len(raw_bytes) - manifests: list[list[tuple[tuple[int, ...], int]]] = [] - - for i in range(len(offsets)): - start = int(offsets[i]) - end = int(offsets[i + 1]) if i + 1 < len(offsets) else total_len - - if start == end: - manifests.append([]) - continue - - segment = raw_bytes[start:end] - arr = np.frombuffer(segment, dtype=dtype) - - if len(arr) % entry_len != 0: - raise ArrayError( - f"Manifest segment length {len(arr)} not divisible by " - f"entry_len={entry_len} (sid_ndim={sid_ndim})" - ) - - entries: list[tuple[tuple[int, ...], int]] = [] - for j in range(0, len(arr), entry_len): - chunk_coords = tuple(int(x) for x in arr[j : j + sid_ndim]) - vg_index = int(arr[j + sid_ndim]) - entries.append((chunk_coords, vg_index)) - manifests.append(entries) - - return manifests - - -# --------------------------------------------------------------------------- -# Vertex offset encoding (vertex_group_offsets: K×1) -# --------------------------------------------------------------------------- - -def encode_vertex_offsets( - vertex_offsets: npt.NDArray[np.int64], -) -> bytes: - """Encode ``(K,)`` int64 vertex byte offsets to bytes.""" - return np.ascontiguousarray(vertex_offsets, dtype=np.int64).tobytes() - - -def decode_vertex_offsets( - raw_bytes: bytes, -) -> npt.NDArray[np.int64]: - """Decode ``(K,)`` int64 vertex byte offsets from bytes.""" - if len(raw_bytes) == 0: - return np.empty(0, dtype=np.int64) - return np.frombuffer(raw_bytes, dtype=np.int64).copy() +# v0.5 ``encode_object_index`` / ``decode_object_index`` and +# ``encode_vertex_offsets`` / ``decode_vertex_offsets`` were removed in +# 0.6.0. 
Object manifests are now encoded as chunk-block payloads (see +# :func:`zarr_vectors.encoding.fragments.encode_object_manifest_blocks`), +# and per-chunk vertex grouping moved to a fragment-index blob under +# ``vertex_fragments/`` (see :mod:`zarr_vectors.encoding.fragments`). +# 0.5.x stores are not readable by this build — rewrite from source. # --------------------------------------------------------------------------- diff --git a/zarr_vectors/lazy/level.py b/zarr_vectors/lazy/level.py index c0bb071..35f23a9 100644 --- a/zarr_vectors/lazy/level.py +++ b/zarr_vectors/lazy/level.py @@ -144,11 +144,11 @@ def preserves_object_ids(self) -> bool: return False @property - def shared_vertex_groups(self) -> bool: - """True when per-chunk vertex groups may be referenced by + def shared_fragments(self) -> bool: + """True when per-chunk fragments may be referenced by multiple objects' manifests (shared metavertices).""" if self._level_meta is not None: - return bool(self._level_meta.shared_vertex_groups) + return bool(self._level_meta.shared_fragments) return False @property diff --git a/zarr_vectors/multiresolution/coarsen.py b/zarr_vectors/multiresolution/coarsen.py index c77dc7a..d35219f 100644 --- a/zarr_vectors/multiresolution/coarsen.py +++ b/zarr_vectors/multiresolution/coarsen.py @@ -24,7 +24,7 @@ from zarr_vectors.constants import ( CAP_MULTISCALE_LINKS, CAP_PRESERVED_OBJECT_IDS, - CAP_SHARED_VERTEX_GROUPS, + CAP_SHARED_FRAGMENTS, COARSEN_PER_OBJECT, DEFAULT_CROSS_LEVEL_DEPTH, DEFAULT_CROSS_LEVEL_STORAGE, @@ -241,7 +241,7 @@ def _per_object_coarsen( "objects_kept": len(keep_oids), "method": COARSEN_PER_OBJECT, "preserves_object_ids": True, - "shared_vertex_groups": True, + "shared_fragments": True, } all_pos = np.concatenate(flat_positions, axis=0) @@ -294,7 +294,7 @@ def _per_object_coarsen( parent_level=source_level, preserves_object_ids=src_has_objects, inherited_num_objects=n_src_objects if src_has_objects else 0, - shared_vertex_groups=True, + 
shared_fragments=True, ) level_group = create_resolution_level(root, target_level, level_meta_initial) create_vertices_array(level_group, dtype="float32") @@ -362,7 +362,7 @@ def _per_object_coarsen( # --- Step 12: stamp root capability tokens -------------------------- if src_has_objects: _stamp_root_capability(root, CAP_PRESERVED_OBJECT_IDS) - _stamp_root_capability(root, CAP_SHARED_VERTEX_GROUPS) + _stamp_root_capability(root, CAP_SHARED_FRAGMENTS) # --- Step 13: emit inline ±1 cross-level link arrays ---------------- if cross_level_storage != XLEVEL_NONE and n_metavertices > 0: @@ -385,7 +385,7 @@ def _per_object_coarsen( "source_objects": n_src_objects, "method": COARSEN_PER_OBJECT, "preserves_object_ids": True, - "shared_vertex_groups": True, + "shared_fragments": True, } @@ -496,7 +496,7 @@ def _write_empty_preserve_level( parent_level=source_level, preserves_object_ids=True, inherited_num_objects=inherited_num_objects, - shared_vertex_groups=True, + shared_fragments=True, ) level_group = create_resolution_level(root, target_level, level_meta) create_vertices_array(level_group, dtype="float32") diff --git a/zarr_vectors/rechunk/rebin.py b/zarr_vectors/rechunk/rebin.py index 94b0da9..caf5a08 100644 --- a/zarr_vectors/rechunk/rebin.py +++ b/zarr_vectors/rechunk/rebin.py @@ -165,7 +165,7 @@ def rebin_level( chunk_attribute_values=level_meta.chunk_attribute_values, preserves_object_ids=level_meta.preserves_object_ids, inherited_num_objects=level_meta.inherited_num_objects, - shared_vertex_groups=level_meta.shared_vertex_groups, + shared_fragments=level_meta.shared_fragments, ) level_group.attrs.update(new_level_meta.to_dict()) diff --git a/zarr_vectors/types/graphs.py b/zarr_vectors/types/graphs.py index bc20d97..0d08069 100644 --- a/zarr_vectors/types/graphs.py +++ b/zarr_vectors/types/graphs.py @@ -30,7 +30,7 @@ LINKS_EXPLICIT, LINKS_IMPLICIT_BRANCHES, OBJIDX_STANDARD, - VERTEX_GROUP_OFFSETS, + VERTEX_FRAGMENTS, VERTICES, ) from zarr_vectors.core.arrays import 
( @@ -552,7 +552,7 @@ def read_graph( _chunk_key_strs = [".".join(str(c) for c in cc) for cc in chunk_keys] _prefetch_plan: list[tuple[str, list[str]]] = [ (VERTICES, _chunk_key_strs), - (VERTEX_GROUP_OFFSETS, _chunk_key_strs), + (VERTEX_FRAGMENTS, _chunk_key_strs), (f"{LINKS}/0", _chunk_key_strs), (f"{CROSS_CHUNK_LINKS}/0", ["data"]), ] diff --git a/zarr_vectors/types/lines.py b/zarr_vectors/types/lines.py index 8747fd7..965bf1d 100644 --- a/zarr_vectors/types/lines.py +++ b/zarr_vectors/types/lines.py @@ -21,7 +21,7 @@ GEOM_LINE, LINKS_IMPLICIT_SEQUENTIAL, OBJIDX_STANDARD, - VERTEX_GROUP_OFFSETS, + VERTEX_FRAGMENTS, VERTICES, ) from zarr_vectors.core.arrays import ( @@ -425,7 +425,7 @@ def read_lines( ] _prefetch_plan: list[tuple[str, list[str]]] = [ (VERTICES, _chunk_key_strs), - (VERTEX_GROUP_OFFSETS, _chunk_key_strs), + (VERTEX_FRAGMENTS, _chunk_key_strs), ] _batched_reads_cm = level_group.batched_reads(_prefetch_plan) _batched_reads_cm.__enter__() diff --git a/zarr_vectors/types/meshes.py b/zarr_vectors/types/meshes.py index efa93ef..2677cbc 100644 --- a/zarr_vectors/types/meshes.py +++ b/zarr_vectors/types/meshes.py @@ -26,7 +26,7 @@ LINKS, LINKS_EXPLICIT, OBJIDX_STANDARD, - VERTEX_GROUP_OFFSETS, + VERTEX_FRAGMENTS, VERTICES, ) from zarr_vectors.core.arrays import ( @@ -425,7 +425,7 @@ def read_mesh( chunk_key_strs = [".".join(str(c) for c in cc) for cc in chunk_keys] _prefetch_plan: list[tuple[str, list[str]]] = [ (VERTICES, chunk_key_strs), - (VERTEX_GROUP_OFFSETS, chunk_key_strs), + (VERTEX_FRAGMENTS, chunk_key_strs), (f"{LINKS}/0", chunk_key_strs), (f"{CROSS_CHUNK_LINKS}/0", ["data"]), ] @@ -552,15 +552,19 @@ def _write_draco_chunk( blob = draco_encode_mesh(positions, faces, quantization_bits=qbits) # Store as raw bytes in the vertices chunk + from zarr_vectors.constants import VERTEX_FRAGMENTS, VERTICES from zarr_vectors.core.arrays import _chunk_key - from zarr_vectors.encoding.ragged import encode_vertex_offsets + from zarr_vectors.encoding.fragments 
import encode_fragments key = _chunk_key(chunk_coords) - level_group.write_bytes("vertices", key, blob) + level_group.write_bytes(VERTICES, key, blob) - # Single vertex group spanning whole chunk - v_off = np.array([0], dtype=np.int64) + # Single fragment spanning the whole chunk. Draco-encoded vertex + # blobs aren't row-addressable, so we describe the (start, count) + # in vertex *rows* as (0, num_positions) — the read path knows to + # treat Draco chunks as a single span. + num_positions = int(np.asarray(positions).shape[0]) level_group.write_bytes( - "vertex_group_offsets", key, - encode_vertex_offsets(v_off), + VERTEX_FRAGMENTS, key, + encode_fragments([(0, num_positions)]), ) diff --git a/zarr_vectors/types/points.py b/zarr_vectors/types/points.py index b898028..828f8cb 100644 --- a/zarr_vectors/types/points.py +++ b/zarr_vectors/types/points.py @@ -26,7 +26,7 @@ OBJIDX_IDENTITY, OBJIDX_STANDARD, VERTEX_ATTRIBUTES, - VERTEX_GROUP_OFFSETS, + VERTEX_FRAGMENTS, VERTICES, ) from zarr_vectors.core.arrays import ( @@ -592,14 +592,14 @@ def read_points( chunk_vg_targets: dict[ChunkCoords, list[int]] | None = None chunk_keys_set: set[ChunkCoords] = set() - # Build the prefetch plan: VERTICES + VERTEX_GROUP_OFFSETS for every + # Build the prefetch plan: VERTICES + VERTEX_FRAGMENTS for every # chunk we may touch, plus each requested attribute array. Cache # misses fall through to the sync ``read_bytes`` path so this is # purely a perf optimisation — correctness is unaffected. 
chunk_key_strs = [".".join(str(c) for c in cc) for cc in chunk_keys] prefetch_plan: list[tuple[str, list[str]]] = [ (VERTICES, chunk_key_strs), - (VERTEX_GROUP_OFFSETS, chunk_key_strs), + (VERTEX_FRAGMENTS, chunk_key_strs), ] if attribute_names: for attr_name in attribute_names: diff --git a/zarr_vectors/types/polylines.py b/zarr_vectors/types/polylines.py index f85ffbf..9520d7f 100644 --- a/zarr_vectors/types/polylines.py +++ b/zarr_vectors/types/polylines.py @@ -24,7 +24,7 @@ GEOM_STREAMLINE, LINKS_IMPLICIT_SEQUENTIAL, OBJIDX_STANDARD, - VERTEX_GROUP_OFFSETS, + VERTEX_FRAGMENTS, VERTICES, ) from zarr_vectors.core.arrays import ( @@ -522,7 +522,7 @@ def read_polylines( ] prefetch_plan: list[tuple[str, list[str]]] = [ (VERTICES, chunk_key_strs), - (VERTEX_GROUP_OFFSETS, chunk_key_strs), + (VERTEX_FRAGMENTS, chunk_key_strs), ] _batched_reads_cm = level_group.batched_reads(prefetch_plan)