diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bddc39..354217e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,9 @@ # Changelog ## Unreleased +- Embedded section handles in `BitVectorData` and added `BitVectorDataMeta` with + `Serializable` support for both `BitVectorData` and `BitVector`, enabling + zero-copy reconstruction from arena metadata. - Introduced a `Serializable` trait for metadata-based reconstruction and implemented it for `CompactVector`, `DacsByte`, and `WaveletMatrix`. - Audited `DacsByte` and `WaveletMatrix` to leverage `SectionHandle::view` diff --git a/INVENTORY.md b/INVENTORY.md index ce43e75..1844470 100644 --- a/INVENTORY.md +++ b/INVENTORY.md @@ -47,7 +47,6 @@ manual offset math in complex `from_bytes` implementations like `DacsByte`. - Investigate slimming `DacsByte` per-level metadata to avoid storing unused flag handles for the last level. - ## Discovered Issues - `katex.html` performs manual string replacements; consider DOM-based manipulation. - Revisit zero-copy storage strategy: avoid extra copies when storing serialized bytes in structures. diff --git a/README.md b/README.md index dad9e0a..fefefca 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ RUSTDOCFLAGS="--html-in-header katex.html" cargo doc --no-deps is backed by `anybytes::View`. Metadata describing a stored sequence includes [`SectionHandle`](anybytes::area::SectionHandle)s so the raw `Bytes` returned by `ByteArea::freeze` can be handed to -`BitVectorData::from_bytes` for zero‑copy reconstruction. +`BitVectorData::from_bytes` with its `BitVectorDataMeta` for zero‑copy reconstruction. Types following this pattern implement the [`Serializable`](src/serialization.rs) trait, which exposes a `metadata` accessor and a `from_bytes` constructor. 
diff --git a/src/bit_vector/mod.rs b/src/bit_vector/mod.rs index c557835..8642f43 100644 --- a/src/bit_vector/mod.rs +++ b/src/bit_vector/mod.rs @@ -126,6 +126,7 @@ pub trait Select { /// The number of bits in a machine word. pub const WORD_LEN: usize = core::mem::size_of::() * 8; +use crate::serialization::Serializable; use anybytes::{area::SectionHandle, ByteArea, Bytes, Section, SectionWriter, View}; use anyhow::{anyhow, Result}; @@ -296,11 +297,13 @@ impl<'a> BitVectorBuilder<'a> { } fn into_data(self) -> BitVectorData { + let handle = self.words.handle(); let words_bytes = self.words.freeze().expect("freeze section"); let words = words_bytes.view::<[u64]>().unwrap(); BitVectorData { words, len: self.len, + handle: Some(handle), } } @@ -318,12 +321,31 @@ impl<'a> BitVectorBuilder<'a> { } /// Immutable bit vector data without auxiliary indexes. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone)] pub struct BitVectorData { /// Underlying machine words storing bit data. pub words: View<[u64]>, /// Number of valid bits in `words`. pub len: usize, + /// Handle to the backing words section, if available. + pub handle: Option>, +} + +impl PartialEq for BitVectorData { + fn eq(&self, other: &Self) -> bool { + self.words == other.words && self.len == other.len + } +} + +impl Eq for BitVectorData {} + +/// Metadata describing a [`BitVectorData`] stored in a [`ByteArea`]. +#[derive(Debug, Clone, Copy)] +pub struct BitVectorDataMeta { + /// Number of bits stored. + pub len: usize, + /// Handle to the raw `u64` words backing the vector. + pub handle: SectionHandle, } impl Default for BitVectorData { @@ -331,6 +353,7 @@ impl Default for BitVectorData { Self { words: Bytes::empty().view::<[u64]>().unwrap(), len: 0, + handle: None, } } } @@ -348,10 +371,23 @@ impl BitVectorData { builder.into_data() } - /// Reconstructs the data from zero-copy [`Bytes`]. 
- pub fn from_bytes(len: usize, bytes: Bytes) -> Result { - let words = bytes.view::<[u64]>().map_err(|e| anyhow::anyhow!(e))?; - Ok(Self { words, len }) - } + /// Returns the metadata needed to reconstruct this data via `from_bytes`. + /// Panics with "missing handle" if no section handle is attached. + pub fn metadata(&self) -> BitVectorDataMeta { + BitVectorDataMeta { + len: self.len, + handle: self.handle.expect("missing handle"), + } + } + + /// Reconstructs the data from zero-copy [`Bytes`] and its metadata. + pub fn from_bytes(meta: BitVectorDataMeta, bytes: Bytes) -> Result { + let words = meta.handle.view(&bytes).map_err(|e| anyhow::anyhow!(e))?; + Ok(Self { + words, + len: meta.len, + handle: Some(meta.handle), + }) + } /// Returns the number of bits stored. @@ -393,6 +429,18 @@ impl BitVectorData { } } +impl Serializable for BitVectorData { + type Meta = BitVectorDataMeta; + + fn metadata(&self) -> Self::Meta { + BitVectorData::metadata(self) + } + + fn from_bytes(meta: Self::Meta, bytes: Bytes) -> Result { + BitVectorData::from_bytes(meta, bytes) + } +} + impl From for BitVector { fn from(data: BitVectorData) -> Self { BitVector::new(data, NoIndex) @@ -514,7 +562,7 @@ impl BitVectorIndex for NoIndex { } /// Immutable bit vector data combined with an auxiliary index. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone)] pub struct BitVector { /// Raw data bits. pub data: BitVectorData, @@ -522,6 +570,14 @@ pub struct BitVector { pub index: I, } +impl PartialEq for BitVector { + fn eq(&self, other: &Self) -> bool { + self.data == other.data && self.index == other.index + } +} + +impl Eq for BitVector {} + /// Iterator over bits in a [`BitVector`]. pub struct Iter<'a, I> { bv: &'a BitVector, @@ -578,6 +634,11 @@ impl BitVector { pub fn to_vec(&self) -> Vec { self.iter().collect() } + + /// Returns the handle to the backing words section, if available.
+ pub fn handle(&self) -> Option> { + self.data.handle + } } impl NumBits for BitVector { @@ -616,6 +677,33 @@ impl Select for BitVector { } } +impl BitVector { + /// Returns the [`BitVectorDataMeta`] describing this vector's underlying data. + /// Only the data is described; the `Serializable` impl rebuilds the index via `I::build`. + pub fn metadata(&self) -> BitVectorDataMeta { + ::metadata(self) + } + + /// Reconstructs the vector from zero-copy [`Bytes`] and its metadata. + pub fn from_bytes(meta: BitVectorDataMeta, bytes: Bytes) -> Result { + ::from_bytes(meta, bytes) + } +} + +impl Serializable for BitVector { + type Meta = BitVectorDataMeta; + + fn metadata(&self) -> Self::Meta { + self.data.metadata() + } + + fn from_bytes(meta: Self::Meta, bytes: Bytes) -> Result { + let data = BitVectorData::from_bytes(meta, bytes)?; + let index = I::build(&data); + Ok(BitVector::new(data, index)) + } +} + pub use rank9sel::Rank9SelIndex; #[cfg(test)] @@ -659,10 +747,9 @@ mod tests { let expected: BitVector = BitVectorData::from_bits([true, false, true, true, false]).into(); let bv: BitVector = builder.freeze::(); - let len = bv.data.len; - let bytes = bv.data.words.clone().bytes(); - let data = BitVectorData::from_bytes(len, bytes).unwrap(); - let other: BitVector = data.into(); + let meta = bv.metadata(); + let bytes = area.freeze().unwrap(); + let other: BitVector = BitVector::from_bytes(meta, bytes).unwrap(); assert_eq!(expected, other); } diff --git a/src/char_sequences/wavelet_matrix.rs b/src/char_sequences/wavelet_matrix.rs index 2ca540b..3e130dd 100644 --- a/src/char_sequences/wavelet_matrix.rs +++ b/src/char_sequences/wavelet_matrix.rs @@ -8,7 +8,8 @@ use anybytes::{area::SectionHandle, ByteArea, Bytes, Section, SectionWriter}; use anyhow::{anyhow, Result}; use crate::bit_vector::{ - Access, BitVector, BitVectorBuilder, BitVectorData, BitVectorIndex, NumBits, Rank, Select, + Access, BitVector, BitVectorBuilder, BitVectorData, BitVectorDataMeta, BitVectorIndex, NumBits, + Rank, Select, }; use
crate::serialization::Serializable; use crate::utils; @@ -806,11 +807,13 @@ impl Serializable for WaveletMatrix { let handles_view = meta.layers.view(&bytes).map_err(anyhow::Error::from)?; let mut layers = Vec::with_capacity(meta.alph_width); for h in handles_view.as_ref() { - let words = h.view(&bytes).map_err(anyhow::Error::from)?; - let data = BitVectorData { - words, - len: meta.len, - }; + let data = BitVectorData::from_bytes( + BitVectorDataMeta { + len: meta.len, + handle: *h, + }, + bytes.clone(), + )?; let index = I::build(&data); layers.push(BitVector::new(data, index)); } diff --git a/src/int_vectors/compact_vector.rs b/src/int_vectors/compact_vector.rs index 75c3e37..ce2dbd3 100644 --- a/src/int_vectors/compact_vector.rs +++ b/src/int_vectors/compact_vector.rs @@ -5,8 +5,7 @@ use anyhow::{anyhow, Result}; use num_traits::ToPrimitive; use std::iter::ExactSizeIterator; -use crate::bit_vector::BitVectorBuilder; -use crate::bit_vector::{BitVector, BitVectorData, NoIndex}; +use crate::bit_vector::{BitVector, BitVectorBuilder, BitVectorData, BitVectorDataMeta, NoIndex}; use crate::int_vectors::prelude::*; use crate::serialization::Serializable; use crate::utils; @@ -136,13 +135,11 @@ impl<'a> CompactVectorBuilder<'a> { /// # } /// ``` pub fn freeze(self) -> CompactVector { - let handle = self.chunks.handle(); let chunks: BitVector = self.chunks.freeze::(); CompactVector { chunks, len: self.len, width: self.width, - handle, } } } @@ -178,16 +175,11 @@ pub struct CompactVector { chunks: BitVector, len: usize, width: usize, - handle: SectionHandle, } impl PartialEq for CompactVector { fn eq(&self, other: &Self) -> bool { - self.chunks == other.chunks - && self.len == other.len - && self.width == other.width - && self.handle.offset == other.handle.offset - && self.handle.len == other.handle.len + self.chunks == other.chunks && self.len == other.len && self.width == other.width } } @@ -198,13 +190,11 @@ impl Default for CompactVector { let mut area = 
ByteArea::new().expect("byte area"); let mut sections = area.sections(); let builder = BitVectorBuilder::with_capacity(0, &mut sections).unwrap(); - let handle = builder.handle(); let chunks = builder.freeze::(); Self { chunks, len: 0, width: 0, - handle, } } } @@ -455,23 +445,24 @@ impl Serializable for CompactVector { CompactVectorMeta { len: self.len, width: self.width, - handle: self.handle, + handle: self.chunks.handle().expect("missing handle"), } } fn from_bytes(meta: Self::Meta, bytes: Bytes) -> Result { let data_len = meta.len * meta.width; - let words_view = meta.handle.view(&bytes).map_err(|e| anyhow!(e))?; - let data = BitVectorData { - words: words_view, - len: data_len, - }; + let data = BitVectorData::from_bytes( + BitVectorDataMeta { + len: data_len, + handle: meta.handle, + }, + bytes, + )?; let chunks = BitVector::new(data, NoIndex); Ok(Self { chunks, len: meta.len, width: meta.width, - handle: meta.handle, }) } } diff --git a/src/int_vectors/dacs_byte.rs b/src/int_vectors/dacs_byte.rs index 1a4a283..31f22a1 100644 --- a/src/int_vectors/dacs_byte.rs +++ b/src/int_vectors/dacs_byte.rs @@ -354,16 +354,15 @@ impl Serializable for DacsByte { let mut data = Vec::with_capacity(meta.num_levels); for (idx, info) in infos.as_ref().iter().enumerate() { if idx + 1 < meta.num_levels { - let words = info.flag.view(&bytes).map_err(anyhow::Error::from)?; - let bv_data = bit_vector::BitVectorData { - words, - len: info.flag_bits, - }; + let bv_data = bit_vector::BitVectorData::from_bytes( + bit_vector::BitVectorDataMeta { + len: info.flag_bits, + handle: info.flag, + }, + bytes.clone(), + )?; let index = I::build(&bv_data); - flags.push(bit_vector::BitVector { - data: bv_data, - index, - }); + flags.push(bit_vector::BitVector::new(bv_data, index)); } let lvl_view = info.level.view(&bytes).map_err(anyhow::Error::from)?; data.push(lvl_view);