diff --git a/CHANGELOG.md b/CHANGELOG.md index b948fa2..a438b90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,6 @@ - `DacsByte::from_slice` now accepts a generic index type, removing `from_slice_with_index`. - Added `BitVectorBuilder` and zero-copy `BitVectorData` backed by `anybytes::View`. - Introduced `IndexBuilder` trait with a `Built` type and adjusted serialization helpers. -- `Rank9SelIndex` now stores its serialized bytes internally and `to_bytes` returns this buffer. - Rename crate to `succdisk` to reflect on-disk succinct data structures. - Rename crate from `succdisk` to `jerky`. - Replaced the old `BitVector` with the generic `BitVector` and renamed the @@ -20,8 +19,9 @@ - Documented the byte layout produced by `DacsByte::to_bytes` with ASCII art. - Switched `anybytes` dependency to track the upstream Git repository for the latest changes. +- Removed internal byte buffers from data structures; `WaveletMatrix`, + `DacsByte`, and `Rank9SelIndex` no longer store a `Bytes` field. - Flags are serialized before level data to eliminate padding. -- `DacsByte` stores all flags and levels in one contiguous byte buffer and `to_bytes` simply clones this buffer. - Added `get_bits` methods to `BitVectorData` and `BitVector`. - Removed deprecated `size_in_bytes` helpers. - Added `scripts/devtest.sh` and `scripts/preflight.sh` for testing and @@ -66,4 +66,3 @@ - Documented `WaveletMatrix` usage in `README.md`. - Moved README usage examples to runnable files in `examples/`. - Added `compact_vector` example showing construction and retrieval. -- WaveletMatrix now stores its serialized word buffer for zero-copy access and preallocates building memory. diff --git a/INVENTORY.md b/INVENTORY.md index 996d892..f726f62 100644 --- a/INVENTORY.md +++ b/INVENTORY.md @@ -15,3 +15,4 @@ ## Discovered Issues - `katex.html` performs manual string replacements; consider DOM-based manipulation. +- Revisit zero-copy storage strategy: avoid extra copies when storing serialized bytes in structures. diff --git a/src/bit_vector/rank9sel/inner.rs b/src/bit_vector/rank9sel/inner.rs index f4f78f4..42b20cc 100644 --- a/src/bit_vector/rank9sel/inner.rs +++ b/src/bit_vector/rank9sel/inner.rs @@ -15,7 +15,6 @@ const SELECT_ZEROS_PER_HINT: usize = SELECT_ONES_PER_HINT; /// The index implementation separated from the bit vector. #[derive(Debug, Clone, PartialEq, Eq)] pub struct Rank9SelIndex { - bytes: Bytes, len: usize, block_rank_pairs: View<[usize]>, select1_hints: Option>, @@ -49,43 +48,16 @@ impl Rank9SelIndexBuilder Rank9SelIndex { - let mut store = Vec::new(); - store.push(self.len); - store.push(self.block_rank_pairs.len()); - store.extend_from_slice(&self.block_rank_pairs); - - if SELECT1 { - let hints = self.select1_hints.unwrap_or_default(); - store.push(hints.len()); - store.extend_from_slice(&hints); - } - - if SELECT0 { - let hints = self.select0_hints.unwrap_or_default(); - store.push(hints.len()); - store.extend_from_slice(&hints); - } - - let bytes = Bytes::from_source(store); - let mut parser = bytes.clone(); - let _len = *parser.view_prefix::().unwrap(); - let brp_len = *parser.view_prefix::().unwrap(); - let block_rank_pairs = parser.view_prefix_with_elems::<[usize]>(brp_len).unwrap(); - let select1_hints = if SELECT1 { - let l = *parser.view_prefix::().unwrap(); - Some(parser.view_prefix_with_elems::<[usize]>(l).unwrap()) - } else { - None - }; - let select0_hints = if SELECT0 { - let l = *parser.view_prefix::().unwrap(); - Some(parser.view_prefix_with_elems::<[usize]>(l).unwrap()) - } else { - None - }; - + let block_rank_pairs = Bytes::from_source(self.block_rank_pairs) + .view::<[usize]>() + .unwrap(); + let select1_hints = self + .select1_hints + .map(|v| Bytes::from_source(v).view::<[usize]>().unwrap()); + let select0_hints = self + .select0_hints + .map(|v| Bytes::from_source(v).view::<[usize]>().unwrap()); Rank9SelIndex:: { - bytes, len: self.len, block_rank_pairs, select1_hints, @@ -469,46 +441,52 @@ impl Rank9SelIndex { impl Rank9SelIndex { /// Reconstructs the index from zero-copy [`Bytes`]. - pub fn from_bytes(bytes: Bytes) -> Result { - let mut parser = bytes.clone(); - let len = *parser + pub fn from_bytes(mut bytes: Bytes) -> Result { + let len = *bytes .view_prefix::() .map_err(|e| anyhow::anyhow!(e))?; - let brp_len = *parser + let brp_len = *bytes .view_prefix::() .map_err(|e| anyhow::anyhow!(e))?; - let block_rank_pairs = parser + let block_rank_pairs = bytes .view_prefix_with_elems::<[usize]>(brp_len) .map_err(|e| anyhow::anyhow!(e))?; - let select1_hints = if SELECT1 { - let l = *parser + let has_select1 = *bytes + .view_prefix::() + .map_err(|e| anyhow::anyhow!(e))? + != 0; + let select1_hints = if has_select1 { + let l = *bytes .view_prefix::() .map_err(|e| anyhow::anyhow!(e))?; Some( - parser + bytes .view_prefix_with_elems::<[usize]>(l) .map_err(|e| anyhow::anyhow!(e))?, ) } else { None }; - let select0_hints = if SELECT0 { - let l = *parser + let has_select0 = *bytes + .view_prefix::() + .map_err(|e| anyhow::anyhow!(e))? + != 0; + let select0_hints = if has_select0 { + let l = *bytes .view_prefix::() .map_err(|e| anyhow::anyhow!(e))?; Some( - parser + bytes .view_prefix_with_elems::<[usize]>(l) .map_err(|e| anyhow::anyhow!(e))?, ) } else { None }; - if !parser.as_ref().is_empty() { - return Err(anyhow::anyhow!("extra bytes")); + if has_select1 != SELECT1 || has_select0 != SELECT0 { + return Err(anyhow::anyhow!("mismatched hint flags")); } Ok(Self { - bytes, len, block_rank_pairs, select1_hints, @@ -518,7 +496,25 @@ impl Rank9SelIndex { /// Serializes the index metadata and data into a [`Bytes`] buffer. pub fn to_bytes(&self) -> Bytes { - self.bytes.clone() + let mut store: Vec = Vec::new(); + store.push(self.len); + store.push(self.block_rank_pairs.len()); + store.extend_from_slice(self.block_rank_pairs.as_ref()); + if let Some(ref v) = self.select1_hints { + store.push(1); + store.push(v.len()); + store.extend_from_slice(v.as_ref()); + } else { + store.push(0); + } + if let Some(ref v) = self.select0_hints { + store.push(1); + store.push(v.len()); + store.extend_from_slice(v.as_ref()); + } else { + store.push(0); + } + Bytes::from_source(store) } } diff --git a/src/char_sequences/wavelet_matrix.rs b/src/char_sequences/wavelet_matrix.rs index d106cb5..9a8fe9a 100644 --- a/src/char_sequences/wavelet_matrix.rs +++ b/src/char_sequences/wavelet_matrix.rs @@ -8,7 +8,8 @@ use anybytes::Bytes; use anyhow::{anyhow, Result}; use crate::bit_vector::{ - Access, BitVector, BitVectorData, BitVectorIndex, NumBits, Rank, Select, WORD_LEN, + Access, BitVector, BitVectorBuilder, BitVectorData, BitVectorIndex, NumBits, Rank, Select, + WORD_LEN, }; use crate::int_vectors::{CompactVector, CompactVectorBuilder}; use crate::utils; @@ -55,21 +56,10 @@ use crate::utils; /// # References /// /// - F. Claude, and G. Navarro, "The Wavelet Matrix," In SPIRE 2012. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Default, Debug, Clone, PartialEq, Eq)] pub struct WaveletMatrix { layers: Vec>, alph_size: usize, - bytes: Bytes, -} - -impl Default for WaveletMatrix { - fn default() -> Self { - Self { - layers: Vec::new(), - alph_size: 0, - bytes: Bytes::empty(), - } - } } /// Metadata describing the serialized form of a [`WaveletMatrix`]. @@ -99,79 +89,55 @@ where return Err(anyhow!("seq must not be empty.")); } - let len = seq.len(); let alph_size = seq.iter().max().unwrap() + 1; let alph_width = utils::needed_bits(alph_size); - let num_words = (len + WORD_LEN - 1) / WORD_LEN; - - let mut store = vec![0usize; alph_width * num_words]; let mut zeros = seq; let mut ones = CompactVector::new(alph_width)?.freeze(); + let mut layers = vec![]; for depth in 0..alph_width { let mut next_zeros = CompactVectorBuilder::new(alph_width).unwrap(); let mut next_ones = CompactVectorBuilder::new(alph_width).unwrap(); - let layer = &mut store[depth * num_words..(depth + 1) * num_words]; - let mut pos = 0; - Self::filter_into( + let mut bv = BitVectorBuilder::new(); + Self::filter( &zeros, alph_width - depth - 1, &mut next_zeros, &mut next_ones, - layer, - &mut pos, + &mut bv, ); - Self::filter_into( + Self::filter( &ones, alph_width - depth - 1, &mut next_zeros, &mut next_ones, - layer, - &mut pos, + &mut bv, ); zeros = next_zeros.freeze(); ones = next_ones.freeze(); + let bits = bv.freeze::(); + layers.push(bits); } - let bytes = Bytes::from_source(store); - let mut layer_bytes = bytes.clone(); - let mut layers = Vec::with_capacity(alph_width); - for _ in 0..alph_width { - let words = layer_bytes - .view_prefix_with_elems::<[usize]>(num_words) - .map_err(|e| anyhow!(e))?; - let data = BitVectorData { words, len }; - let index = I::build(&data); - layers.push(BitVector::new(data, index)); - } - - Ok(Self { - layers, - alph_size, - bytes, - }) + Ok(Self { layers, alph_size }) } - fn filter_into( + fn filter( seq: &CompactVector, shift: usize, next_zeros: &mut CompactVectorBuilder, next_ones: &mut CompactVectorBuilder, - layer: &mut [usize], - pos: &mut usize, + bv: &mut BitVectorBuilder, ) { for val in seq.iter() { let bit = ((val >> shift) & 1) == 1; + bv.push_bit(bit); if bit { - let idx = *pos / WORD_LEN; - let sh = *pos % WORD_LEN; - layer[idx] |= 1usize << sh; next_ones.push_int(val).unwrap(); } else { next_zeros.push_int(val).unwrap(); } - *pos += 1; } } @@ -611,21 +577,24 @@ where /// Serializes the sequence into a [`Bytes`] buffer along with its metadata. pub fn to_bytes(&self) -> (WaveletMatrixMeta, Bytes) { + let mut store: Vec = Vec::new(); + for layer in &self.layers { + store.extend_from_slice(layer.data.words()); + } let meta = WaveletMatrixMeta { alph_size: self.alph_size, alph_width: self.alph_width(), len: self.len(), }; - (meta, self.bytes.clone()) + (meta, Bytes::from_source(store)) } /// Reconstructs the sequence from metadata and a zero-copy [`Bytes`] buffer. - pub fn from_bytes(meta: WaveletMatrixMeta, bytes: Bytes) -> Result { + pub fn from_bytes(meta: WaveletMatrixMeta, mut bytes: Bytes) -> Result { let mut layers = Vec::with_capacity(meta.alph_width); let num_words = (meta.len + WORD_LEN - 1) / WORD_LEN; - let mut slice = bytes.clone(); for _ in 0..meta.alph_width { - let words = slice + let words = bytes .view_prefix_with_elems::<[usize]>(num_words) .map_err(|e| anyhow!(e))?; let data = BitVectorData { @@ -638,7 +607,6 @@ where Ok(Self { layers, alph_size: meta.alph_size, - bytes, }) } } diff --git a/src/int_vectors/dacs_byte.rs b/src/int_vectors/dacs_byte.rs index bb2d3d6..4e91393 100644 --- a/src/int_vectors/dacs_byte.rs +++ b/src/int_vectors/dacs_byte.rs @@ -58,7 +58,6 @@ const MAX_LEVELS: usize = (usize::BITS as usize + LEVEL_WIDTH - 1) / LEVEL_WIDTH /// codes." Information Processing & Management, 49(1), 392-404, 2013. #[derive(Clone, PartialEq, Eq)] pub struct DacsByte { - bytes: Bytes, data: Vec>, flags: Vec>, } @@ -132,96 +131,41 @@ impl DacsByte { assert_ne!(num_levels, 0); if num_levels == 1 { - let buf: Vec = vals + let data: Vec<_> = vals .iter() .map(|x| u8::try_from(x.to_usize().unwrap()).unwrap()) .collect(); - let bytes = Bytes::from_source(buf); - let data = vec![bytes.clone().view::<[u8]>().unwrap()]; return Ok(Self { - bytes, - data, + data: vec![Bytes::from_source(data).view::<[u8]>().unwrap()], flags: vec![], }); } - let mut level_data = vec![vec![]; num_levels]; - let mut flag_builders = vec![BitVectorBuilder::new(); num_levels - 1]; + let mut data = vec![vec![]; num_levels]; + let mut flags = vec![BitVectorBuilder::new(); num_levels - 1]; for x in vals { let mut x = x.to_usize().unwrap(); for j in 0..num_levels { - level_data[j].push(u8::try_from(x & LEVEL_MASK).unwrap()); + data[j].push(u8::try_from(x & LEVEL_MASK).unwrap()); x >>= LEVEL_WIDTH; if j == num_levels - 1 { assert_eq!(x, 0); break; } else if x == 0 { - flag_builders[j].push_bit(false); + flags[j].push_bit(false); break; } - flag_builders[j].push_bit(true); + flags[j].push_bit(true); } } - use std::mem::size_of; - - let usize_size = size_of::(); - let mut flag_bytes = Vec::new(); - let mut flag_info = Vec::with_capacity(flag_builders.len()); - for b in flag_builders.into_iter() { - let (len_bits, bytes) = b.into_bytes(); - let num_words = bytes.as_ref().len() / usize_size; - flag_info.push((len_bits, num_words)); - flag_bytes.push(bytes); - } - - let level_lens: Vec = level_data.iter().map(|v| v.len()).collect(); - - let total_flags: usize = flag_info.iter().map(|(_, w)| w * usize_size).sum(); - let total_levels: usize = level_lens.iter().sum(); - let mut buf = Vec::with_capacity(total_flags + total_levels); - let mut flag_offsets = Vec::with_capacity(flag_bytes.len()); - for bytes in &flag_bytes { - flag_offsets.push(buf.len()); - buf.extend_from_slice(bytes.as_ref()); - } - let mut level_offsets = Vec::with_capacity(level_data.len()); - for level in &level_data { - level_offsets.push(buf.len()); - buf.extend_from_slice(level); - } - - let bytes = Bytes::from_source(buf); - let mut flags = Vec::with_capacity(flag_offsets.len()); - for ((len_bits, num_words), offset) in flag_info.into_iter().zip(flag_offsets) { - let start = offset; - let end = start + num_words * usize_size; - let words_view = bytes - .slice_to_bytes(&bytes.as_ref()[start..end]) - .ok_or_else(|| anyhow!("invalid slice"))? - .view::<[usize]>() - .map_err(|e| anyhow!(e))?; - let data = bit_vector::BitVectorData { - words: words_view, - len: len_bits, - }; - let index = I::build(&data); - flags.push(bit_vector::BitVector { data, index }); - } - - let mut data = Vec::with_capacity(level_offsets.len()); - for (offset, len) in level_offsets.into_iter().zip(level_lens) { - let start = offset; - let end = start + len; - let view_bytes = bytes - .slice_to_bytes(&bytes.as_ref()[start..end]) - .ok_or_else(|| anyhow!("invalid slice"))?; - let view = view_bytes.view::<[u8]>().map_err(|e| anyhow!(e))?; - data.push(view); - } - - Ok(Self { bytes, data, flags }) + let flags = flags.into_iter().map(|bvb| bvb.freeze::()).collect(); + let data = data + .into_iter() + .map(|v| Bytes::from_source(v).view::<[u8]>().unwrap()) + .collect(); + Ok(Self { data, flags }) } /// Creates an iterator for enumerating integers. @@ -291,13 +235,24 @@ impl DacsByte { }) .collect::>(); + let mut buf: Vec = Vec::new(); + for flag in &self.flags { + for &word in flag.data.words.as_ref() { + buf.extend_from_slice(&word.to_ne_bytes()); + } + } + + for level in &self.data { + buf.extend_from_slice(level.as_ref()); + } + ( DacsByteMeta { num_levels: self.data.len(), level_lens, flag_meta, }, - self.bytes.clone(), + Bytes::from_source(buf), ) } @@ -352,7 +307,7 @@ impl DacsByte { cursor += len; } - Ok(Self { bytes, data, flags }) + Ok(Self { data, flags }) } } @@ -360,7 +315,6 @@ impl Default for DacsByte { fn default() -> Self { Self { // Needs a single level at least. - bytes: Bytes::empty(), data: vec![Bytes::empty().view::<[u8]>().unwrap()], flags: vec![], }