diff --git a/CHANGELOG.md b/CHANGELOG.md index ab7a242..af1928f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,8 @@ All significant changes to this project will be documented in this file. * `CountMinSketch` with unsigned values now supports `halve` and `decay` operations. * `CpcSketch` and `CpcUnion` are now available for cardinality estimation. -* `FrequentItemsSketch` now supports serde for `u64` value. +* `FrequentItemsSketch` now supports serde for any value implementing `FrequentItemValue` (built-in support for `i64`, `u64`, and `String`). +* Expose `codec::SketchBytes`, `codec::SketchSlice`, and `FrequentItemValue` as public API. ## v0.2.0 (2026-01-14) diff --git a/datasketches/src/bloom/builder.rs b/datasketches/src/bloom/builder.rs index 1918a13..6cb0158 100644 --- a/datasketches/src/bloom/builder.rs +++ b/datasketches/src/bloom/builder.rs @@ -22,9 +22,9 @@ use crate::hash::DEFAULT_UPDATE_SEED; /// Builder for creating [`BloomFilter`] instances. /// /// Provides two construction modes: -/// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate +/// * [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate /// (recommended) -/// - [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual) +/// * [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual) #[derive(Debug, Clone)] pub struct BloomFilterBuilder { num_bits: u64, @@ -52,8 +52,8 @@ impl BloomFilterBuilder { /// /// # Arguments /// - /// - `max_items`: Maximum expected number of distinct items - /// - `fpp`: Target false positive probability (e.g., 0.01 for 1%) + /// * `max_items`: Maximum expected number of distinct items + /// * `fpp`: Target false positive probability (e.g., 0.01 for 1%) /// /// # Panics /// @@ -95,14 +95,14 @@ impl BloomFilterBuilder { /// /// # Arguments /// - /// - `num_bits`: Total number of bits in the filter - /// - `num_hashes`: Number of 
hash functions to use + /// * `num_bits`: Total number of bits in the filter + /// * `num_hashes`: Number of hash functions to use /// /// # Panics /// /// Panics if any of: - /// - `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` > [`Self::MAX_NUM_BITS`] - /// - `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > [`Self::MIN_NUM_HASHES`] + /// * `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` > [`Self::MAX_NUM_BITS`] + /// * `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > [`Self::MAX_NUM_HASHES`] /// /// # Examples /// diff --git a/datasketches/src/bloom/mod.rs b/datasketches/src/bloom/mod.rs index e5ac69e..8e58139 100644 --- a/datasketches/src/bloom/mod.rs +++ b/datasketches/src/bloom/mod.rs @@ -23,10 +23,10 @@ //! //! # Properties //! -//! - **No false negatives**: If an item was inserted, `contains()` will always return `true` -//! - **Possible false positives**: `contains()` may return `true` for items never inserted -//! - **Fixed size**: Unlike typical sketches, Bloom filters do not resize automatically -//! - **Linear space**: Size is proportional to the expected number of distinct items +//! * **No false negatives**: If an item was inserted, `contains()` will always return `true` +//! * **Possible false positives**: `contains()` may return `true` for items never inserted +//! * **Fixed size**: Unlike typical sketches, Bloom filters do not resize automatically +//! * **Linear space**: Size is proportional to the expected number of distinct items //! //! # Usage //! @@ -109,15 +109,15 @@ //! //! # Implementation Details //! -//! - Uses XXHash64 for hashing -//! - Implements double hashing (Kirsch-Mitzenmacher method) for k hash functions -//! - Bits packed efficiently in `u64` words -//! - Compatible serialization format (family ID: 21) +//! * Uses XXHash64 for hashing +//! * Implements double hashing (Kirsch-Mitzenmacher method) for k hash functions +//! * Bits packed efficiently in `u64` words +//! 
* Compatible serialization format (family ID: 21) //! //! # References //! -//! - Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with allowable errors" -//! - Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: Building a Better Bloom +//! * Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with allowable errors" +//! * Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: Building a Better Bloom //! Filter" mod builder; diff --git a/datasketches/src/bloom/sketch.rs b/datasketches/src/bloom/sketch.rs index 304c450..d7332e0 100644 --- a/datasketches/src/bloom/sketch.rs +++ b/datasketches/src/bloom/sketch.rs @@ -33,9 +33,9 @@ const EMPTY_FLAG_MASK: u8 = 1 << 2; /// A Bloom filter for probabilistic set membership testing. /// /// Provides fast membership queries with: -/// - No false negatives (inserted items always return `true`) -/// - Tunable false positive rate -/// - Constant space usage +/// * No false negatives (inserted items always return `true`) +/// * Tunable false positive rate +/// * Constant space usage /// /// Use [`super::BloomFilterBuilder`] to construct instances. #[derive(Debug, Clone, PartialEq)] @@ -54,8 +54,8 @@ impl BloomFilter { /// Tests whether an item is possibly in the set. /// /// Returns: - /// - `true`: Item was **possibly** inserted (or false positive) - /// - `false`: Item was **definitely not** inserted + /// * `true`: Item was **possibly** inserted (or false positive) + /// * `false`: Item was **definitely not** inserted /// /// # Examples /// @@ -290,8 +290,8 @@ impl BloomFilter { /// /// Uses the approximation: `load_factor^k` /// where: - /// - load_factor = fraction of bits set (bits_used / capacity) - /// - k = num_hashes + /// * load_factor = fraction of bits set (bits_used / capacity) + /// * k = num_hashes /// /// This assumes uniform bit distribution and is more accurate than /// trying to estimate insertion count from the load factor. 
@@ -307,9 +307,9 @@ impl BloomFilter { /// Checks if two filters are compatible for merging. /// /// Filters are compatible if they have the same: - /// - Capacity (number of bits) - /// - Number of hash functions - /// - Seed + /// * Capacity (number of bits) + /// * Number of hash functions + /// * Seed pub fn is_compatible(&self, other: &Self) -> bool { self.bit_array.len() == other.bit_array.len() && self.num_hashes == other.num_hashes @@ -379,9 +379,9 @@ impl BloomFilter { /// # Errors /// /// Returns an error if: - /// - The data is truncated or corrupted - /// - The family ID doesn't match (not a Bloom filter) - /// - The serial version is unsupported + /// * The data is truncated or corrupted + /// * The family ID doesn't match (not a Bloom filter) + /// * The serial version is unsupported /// /// # Examples /// @@ -501,8 +501,8 @@ impl BloomFilter { /// Computes the two base hash values using XXHash64. /// /// Uses a two-hash approach: - /// - h0 = XXHash64(item, seed) - /// - h1 = XXHash64(item, h0) + /// * h0 = XXHash64(item, seed) + /// * h1 = XXHash64(item, h0) fn compute_hash(&self, item: &T) -> (u64, u64) { // First hash with the configured seed let mut hasher = XxHash64::with_seed(self.seed); diff --git a/datasketches/src/common/binomial_bounds.rs b/datasketches/src/common/binomial_bounds.rs index afeeddc..c2b7d74 100644 --- a/datasketches/src/common/binomial_bounds.rs +++ b/datasketches/src/common/binomial_bounds.rs @@ -274,9 +274,9 @@ static UB_EQUIV_TABLE: [f64; 363] = [ /// /// # Arguments /// -/// * `num_samples` - The number of samples in the sample set. -/// * `theta` - The sampling probability. Must be in the range (0.0, 1.0]. -/// * `num_std_dev` - The number of standard deviations for confidence bounds. +/// * `num_samples`: The number of samples in the sample set. +/// * `theta`: The sampling probability. Must be in the range (0.0, 1.0]. +/// * `num_std_dev`: The number of standard deviations for confidence bounds. 
/// /// # Returns /// @@ -301,11 +301,11 @@ pub(crate) fn lower_bound( /// /// # Arguments /// -/// * `num_samples` - The number of samples in the sample set. -/// * `theta` - The sampling probability. Must be in the range `(0.0, 1.0]`. -/// * `num_std_dev` - The number of standard deviations for confidence bounds. -/// * `no_data_seen` - This is normally false. However, in the case where you have zero samples and -/// a theta < 1.0, this flag enables the distinction between a virgin case when no actual data has +/// * `num_samples`: The number of samples in the sample set. +/// * `theta`: The sampling probability. Must be in the range `(0.0, 1.0]`. +/// * `num_std_dev`: The number of standard deviations for confidence bounds. +/// * `no_data_seen`: This is normally false. However, in the case where you have zero samples and a +/// theta < 1.0, this flag enables the distinction between a virgin case when no actual data has /// been seen and the case where the estimate may be zero but an upper error bound may still /// exist. /// @@ -367,16 +367,16 @@ fn cont_classic_ub(num_samples: u64, theta: f64, num_std_devs: f64) -> f64 { /// /// # Arguments /// -/// * `num_samples` - The number of observed samples (k). Must be >= 1. -/// * `p` - The sampling probability. Must satisfy: 0 < p < 1. -/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1. +/// * `num_samples`: The number of observed samples (k). Must be >= 1. +/// * `p`: The sampling probability. Must satisfy: 0 < p < 1. +/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1. 
/// /// # Invariants /// -/// - `num_samples >= 1` -/// - `0.0 < p < 1.0` -/// - `0.0 < delta < 1.0` -/// - `(num_samples / p) < 500.0` (enforced for performance and numerical stability) +/// * `num_samples >= 1` +/// * `0.0 < p < 1.0` +/// * `0.0 < delta < 1.0` +/// * `(num_samples / p) < 500.0` (enforced for performance and numerical stability) /// /// # Returns /// @@ -413,15 +413,15 @@ fn special_n_star(num_samples: u64, p: f64, delta: f64) -> Result { /// /// # Arguments /// -/// * `num_samples` - The number of observed samples (k). Must be >= 1. -/// * `p` - The sampling probability. Must satisfy: 0 < p < 1. -/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1. +/// * `num_samples`: The number of observed samples (k). Must be >= 1. +/// * `p`: The sampling probability. Must satisfy: 0 < p < 1. +/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1. /// /// # Invariants /// -/// - `num_samples >= 1` -/// - `0.0 < p < 1.0` -/// - `0.0 < delta < 1.0` +/// * `num_samples >= 1` +/// * `0.0 < p < 1.0` +/// * `0.0 < delta < 1.0` /// /// # Returns /// @@ -452,14 +452,14 @@ fn special_n_prime_b(num_samples: u64, p: f64, delta: f64) -> Result /// /// # Arguments /// -/// * `num_samples` - The number of observed samples (k). Must be >= 1. -/// * `p` - The sampling probability. Must satisfy: 0 < p < 1. -/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1. +/// * `num_samples`: The number of observed samples (k). Must be >= 1. +/// * `p`: The sampling probability. Must satisfy: 0 < p < 1. +/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1. /// /// # Invariants /// -/// - `(num_samples / p) < 500.0` (enforced for performance) -/// - A super-small delta could also make it slow. +/// * `(num_samples / p) < 500.0` (enforced for performance) +/// * A super-small delta could also make it slow. 
fn special_n_prime_f(num_samples: u64, p: f64, delta: f64) -> Result { // Use a different algorithm if the following is true; this one will be too slow, or worse. if (num_samples as f64 / p) >= 500.0 { diff --git a/datasketches/src/countmin/sketch.rs b/datasketches/src/countmin/sketch.rs index 3bc50a3..2116b75 100644 --- a/datasketches/src/countmin/sketch.rs +++ b/datasketches/src/countmin/sketch.rs @@ -75,10 +75,10 @@ impl CountMinSketch { /// # Panics /// /// Panics if any of: - /// - `num_hashes` is 0 - /// - `num_buckets` is less than 3 - /// - the total table size exceeds the supported limit - /// - the computed seed hash is zero + /// * `num_hashes` is 0 + /// * `num_buckets` is less than 3 + /// * the total table size exceeds the supported limit + /// * the computed seed hash is zero /// /// # Examples /// diff --git a/datasketches/src/frequencies/mod.rs b/datasketches/src/frequencies/mod.rs index 93fb5e4..d5e7cf0 100644 --- a/datasketches/src/frequencies/mod.rs +++ b/datasketches/src/frequencies/mod.rs @@ -17,16 +17,66 @@ //! Frequency sketches for finding heavy hitters in data streams. //! -//! This module implements the Frequent Items sketch from Apache DataSketches. It tracks -//! approximate frequencies in a stream and can report heavy hitters with explicit -//! error guarantees (no false negatives or no false positives). +//! # Overview //! -//! For background, see the Java documentation: -//! +//! This sketch is based on the paper ["A High-Performance Algorithm for Identifying Frequent Items +//! in Data Streams"](https://arxiv.org/abs/1705.07001) by Daniel Anderson, Pryce Bevan, Kevin Lang, +//! Edo Liberty, Lee Rhodes, and Justin Thaler. //! -//! # Usage +//! This sketch is useful for tracking approximate frequencies of items of type `T` that implements +//! [`FrequentItemValue`], with optional associated counts (`T` item, `u64` count) that are members +//! of a multiset of such items. The true frequency of an item is defined to be the sum of +//! 
associated counts. //! -//! ```rust +//! This implementation provides the following capabilities: +//! * Estimate the frequency of an item. +//! * Return upper and lower bounds of any item, such that the true frequency is always between the +//! upper and lower bounds. +//! * Return a global maximum error that holds for all items in the stream. +//! * Return an array of frequent items that qualify either [`ErrorType::NoFalsePositives`] or +//! [`ErrorType::NoFalseNegatives`]. +//! * Merge itself with another sketch created from this module. +//! * Serialize to bytes, or deserialize from bytes, for storage or transmission. +//! +//! # Accuracy +//! +//! If fewer than `0.75 * max_map_size` different items are inserted into the sketch the estimated +//! frequencies returned by the sketch will be exact. +//! +//! The logic of the frequent items sketch is such that the stored counts and true counts are never +//! too different. More specifically, for any item, the sketch can return an estimate of the true +//! frequency of item, along with upper and lower bounds on the frequency (that hold +//! deterministically). +//! +//! For this implementation and for a specific active item, it is guaranteed that the true frequency +//! will be between the Upper Bound (UB) and the Lower Bound (LB) computed for that item. +//! Specifically, `(UB - LB) ≤ W * epsilon`, where `W` denotes the sum of all item counts, and +//! `epsilon = 3.5/M`, where `M` is the `max_map_size`. +//! +//! This is the worst case guarantee that applies to arbitrary inputs. [^1] +//! For inputs typically seen in practice (`UB - LB`) is usually much smaller. +//! +//! [^1]: For speed we do employ some randomization that introduces a small probability that our +//! proof of the worst-case bound might not apply to a given run. However, we have ensured that this +//! probability is extremely small. For example, if the stream causes one table purge (rebuild), +//! 
our proof of the worst case bound applies with probability at least `1 - 1E-14`. If the stream +//! causes `1E9` purges, our proof applies with probability at least `1 - 1E-5`. +//! +//! # Background +//! +//! This code implements a variant of what is commonly known as the "Misra-Gries algorithm". +//! Variants of it were discovered and rediscovered and redesigned several times over the years: +//! * "Finding repeated elements", Misra, Gries, 1982 +//! * "Frequency estimation of Internet packet streams with limited space" Demaine, Lopez-Ortiz, +//! Munro, 2002 +//! * "A simple algorithm for finding frequent elements in streams and bags" Karp, Shenker, +//! Papadimitriou, 2003 +//! * "Efficient Computation of Frequent and Top-k Elements in Data Streams" Metwally, Agrawal, +//! Abbadi, 2006 +//! +//! # Examples +//! +//! ``` //! # use datasketches::frequencies::ErrorType; //! # use datasketches::frequencies::FrequentItemsSketch; //! let mut sketch = FrequentItemsSketch::::new(64); @@ -38,7 +88,7 @@ //! //! # Serialization //! -//! ```rust +//! ``` //! # use datasketches::frequencies::FrequentItemsSketch; //! let mut sketch = FrequentItemsSketch::::new(64); //! 
sketch.update_with_count(42, 2); @@ -52,6 +102,7 @@ mod reverse_purge_item_hash_map; mod serialization; mod sketch; +pub use self::serialization::FrequentItemValue; pub use self::sketch::ErrorType; pub use self::sketch::FrequentItemsSketch; pub use self::sketch::Row; diff --git a/datasketches/src/frequencies/reverse_purge_item_hash_map.rs b/datasketches/src/frequencies/reverse_purge_item_hash_map.rs index f934b87..79ed290 100644 --- a/datasketches/src/frequencies/reverse_purge_item_hash_map.rs +++ b/datasketches/src/frequencies/reverse_purge_item_hash_map.rs @@ -192,7 +192,7 @@ impl ReversePurgeItemHashMap { T: Clone, { if self.num_active == 0 { - return Vec::new(); + return vec![]; } let mut keys = Vec::with_capacity(self.num_active); for i in 0..self.keys.len() { @@ -208,7 +208,7 @@ impl ReversePurgeItemHashMap { /// Returns the active values in the map. pub fn active_values(&self) -> Vec { if self.num_active == 0 { - return Vec::new(); + return vec![]; } let mut values = Vec::with_capacity(self.num_active); for i in 0..self.values.len() { diff --git a/datasketches/src/frequencies/sketch.rs b/datasketches/src/frequencies/sketch.rs index 13c79f9..83de1cf 100644 --- a/datasketches/src/frequencies/sketch.rs +++ b/datasketches/src/frequencies/sketch.rs @@ -85,7 +85,7 @@ impl Row { /// The sketch tracks approximate item frequencies and can return estimates with /// guaranteed upper and lower bounds. /// -/// See [`crate::frequencies`] for an overview and error guarantees. +/// See the [module level documentation](super) for an overview and error guarantees. #[derive(Debug, Clone)] pub struct FrequentItemsSketch { lg_max_map_size: u8, @@ -296,7 +296,7 @@ impl FrequentItemsSketch { /// Returns frequent items using the sketch maximum error as threshold. /// - /// This is equivalent to `frequent_items_with_threshold(self.maximum_error(), error_type)`. + /// This is equivalent to `frequent_items_with_threshold(error_type, self.maximum_error())`. 
/// /// # Examples /// @@ -343,7 +343,7 @@ impl FrequentItemsSketch { T: Clone, { let threshold = threshold.max(self.offset); - let mut rows = Vec::new(); + let mut rows = vec![]; for (item, count) in self.hash_map.iter() { let lower = count; let upper = count + self.offset; @@ -510,74 +510,36 @@ impl FrequentItemsSketch { } } -impl FrequentItemsSketch { +impl FrequentItemsSketch { /// Serializes this sketch into a byte vector. /// /// # Examples /// - /// ``` - /// # use datasketches::frequencies::FrequentItemsSketch; - /// # let mut sketch = FrequentItemsSketch::::new(64); - /// # sketch.update_with_count(7, 2); - /// let bytes = sketch.serialize(); - /// let decoded = FrequentItemsSketch::::deserialize(&bytes).unwrap(); - /// assert!(decoded.estimate(&7) >= 2); - /// ``` - pub fn serialize(&self) -> Vec { - self.serialize_inner( - |items| items.iter().map(i64::serialize_size).sum(), - |bytes, items| { - for item in items { - item.serialize_value(bytes); - } - }, - ) - } - - /// Deserializes a sketch from bytes. - /// - /// # Examples + /// Built-in support for `i64`: /// /// ``` /// # use datasketches::frequencies::FrequentItemsSketch; /// # let mut sketch = FrequentItemsSketch::::new(64); /// # sketch.update_with_count(7, 2); - /// # let bytes = sketch.serialize(); + /// let bytes = sketch.serialize(); /// let decoded = FrequentItemsSketch::::deserialize(&bytes).unwrap(); /// assert!(decoded.estimate(&7) >= 2); /// ``` - pub fn deserialize(bytes: &[u8]) -> Result { - Self::deserialize_inner(bytes, |mut cursor, num_items| { - let mut items = Vec::with_capacity(num_items); - for i in 0..num_items { - let item = i64::deserialize_value(&mut cursor).map_err(|_| { - Error::insufficient_data(format!( - "expected {num_items} items, failed to read item at index {i}" - )) - })?; - items.push(item); - } - Ok(items) - }) - } -} - -impl FrequentItemsSketch { - /// Serializes this sketch into a byte vector. 
/// - /// # Examples + /// Built-in support for `String`: /// /// ``` /// # use datasketches::frequencies::FrequentItemsSketch; - /// # let mut sketch = FrequentItemsSketch::::new(64); - /// # sketch.update_with_count(7, 2); + /// # let mut sketch = FrequentItemsSketch::::new(64); + /// # let apple = "apple".to_string(); + /// # sketch.update_with_count(apple.clone(), 2); /// let bytes = sketch.serialize(); - /// let decoded = FrequentItemsSketch::::deserialize(&bytes).unwrap(); - /// assert!(decoded.estimate(&7) >= 2); + /// let decoded = FrequentItemsSketch::::deserialize(&bytes).unwrap(); + /// assert!(decoded.estimate(&apple) >= 2); /// ``` pub fn serialize(&self) -> Vec { self.serialize_inner( - |items| items.iter().map(u64::serialize_size).sum(), + |items| items.iter().map(T::serialize_size).sum(), |bytes, items| { for item in items { item.serialize_value(bytes); @@ -590,58 +552,18 @@ impl FrequentItemsSketch { /// /// # Examples /// + /// Built-in support for `i64`: + /// /// ``` /// # use datasketches::frequencies::FrequentItemsSketch; - /// # let mut sketch = FrequentItemsSketch::::new(64); + /// # let mut sketch = FrequentItemsSketch::::new(64); /// # sketch.update_with_count(7, 2); /// # let bytes = sketch.serialize(); - /// let decoded = FrequentItemsSketch::::deserialize(&bytes).unwrap(); + /// let decoded = FrequentItemsSketch::::deserialize(&bytes).unwrap(); /// assert!(decoded.estimate(&7) >= 2); /// ``` - pub fn deserialize(bytes: &[u8]) -> Result { - Self::deserialize_inner(bytes, |mut cursor, num_items| { - let mut items = Vec::with_capacity(num_items); - for i in 0..num_items { - let item = u64::deserialize_value(&mut cursor).map_err(|_| { - Error::insufficient_data(format!( - "expected {num_items} items, failed to read item at index {i}" - )) - })?; - items.push(item); - } - Ok(items) - }) - } -} - -impl FrequentItemsSketch { - /// Serializes this sketch into a byte vector. 
- /// - /// # Examples - /// - /// ``` - /// # use datasketches::frequencies::FrequentItemsSketch; - /// # let mut sketch = FrequentItemsSketch::::new(64); - /// # let apple = "apple".to_string(); - /// # sketch.update_with_count(apple.clone(), 2); - /// let bytes = sketch.serialize(); - /// let decoded = FrequentItemsSketch::::deserialize(&bytes).unwrap(); - /// assert!(decoded.estimate(&apple) >= 2); - /// ``` - pub fn serialize(&self) -> Vec { - self.serialize_inner( - |items| items.iter().map(String::serialize_size).sum(), - |bytes, items| { - for item in items { - item.serialize_value(bytes); - } - }, - ) - } - - /// Deserializes a sketch from bytes. /// - /// # Examples + /// Built-in support for `String`: /// /// ``` /// # use datasketches::frequencies::FrequentItemsSketch; @@ -656,7 +578,7 @@ impl FrequentItemsSketch { Self::deserialize_inner(bytes, |mut cursor, num_items| { let mut items = Vec::with_capacity(num_items); for i in 0..num_items { - let item = String::deserialize_value(&mut cursor).map_err(|_| { + let item = T::deserialize_value(&mut cursor).map_err(|_| { Error::insufficient_data(format!( "expected {num_items} items, failed to read item at index {i}" )) diff --git a/datasketches/src/hash/mod.rs b/datasketches/src/hash/mod.rs index 87eaf22..99d2cca 100644 --- a/datasketches/src/hash/mod.rs +++ b/datasketches/src/hash/mod.rs @@ -19,7 +19,6 @@ mod murmurhash; mod xxhash; pub(crate) use self::murmurhash::MurmurHash3X64128; -#[allow(unused_imports)] pub(crate) use self::xxhash::XxHash64; /// The seed 9001 used in the sketch update methods is a prime number that was chosen very early diff --git a/datasketches/src/hll/array4.rs b/datasketches/src/hll/array4.rs index a17b4da..073c335 100644 --- a/datasketches/src/hll/array4.rs +++ b/datasketches/src/hll/array4.rs @@ -79,8 +79,8 @@ impl Array4 { /// Get the actual value at a slot (adjusted for cur_min and aux_map) /// /// Returns the true register value: - /// - If raw < 15: value = cur_min + raw - /// 
- If raw == 15 (AUX_TOKEN): value is in aux_map + /// * If raw < 15: value = cur_min + raw + /// * If raw == 15 (AUX_TOKEN): value is in aux_map pub(super) fn get(&self, slot: u32) -> u8 { let raw = self.get_raw(slot); diff --git a/datasketches/src/hll/array8.rs b/datasketches/src/hll/array8.rs index 00faf16..2bd1509 100644 --- a/datasketches/src/hll/array8.rs +++ b/datasketches/src/hll/array8.rs @@ -187,8 +187,8 @@ impl Array8 { /// /// # Parameters /// - /// * `src` - Source register values (length must be 2^src_lg_k) - /// * `src_lg_k` - Log2 of source register count + /// * `src`: Source register values (length must be 2^src_lg_k) + /// * `src_lg_k`: Log2 of source register count /// /// # Panics /// diff --git a/datasketches/src/hll/estimator.rs b/datasketches/src/hll/estimator.rs index a9bd63b..7c9ca08 100644 --- a/datasketches/src/hll/estimator.rs +++ b/datasketches/src/hll/estimator.rs @@ -33,9 +33,9 @@ use crate::hll::harmonic_numbers; /// allowing it to be composed into Array4, Array6, and Array8. /// /// The estimator supports two modes: -/// - **In-order mode**: Uses HIP (Historical Inverse Probability) accumulator for accurate +/// * **In-order mode**: Uses HIP (Historical Inverse Probability) accumulator for accurate /// sequential updates -/// - **Out-of-order mode**: Uses composite estimator (raw HLL + linear counting) after +/// * **Out-of-order mode**: Uses composite estimator (raw HLL + linear counting) after /// deserialization or merging #[derive(Debug, Clone, PartialEq)] pub struct HipEstimator { @@ -71,8 +71,8 @@ impl HipEstimator { /// 2. 
Update KxQ registers (always) /// /// The KxQ registers are split for numerical precision: - /// - kxq0: sum of 1/2^v for v < 32 - /// - kxq1: sum of 1/2^v for v >= 32 + /// * kxq0: sum of 1/2^v for v < 32 + /// * kxq1: sum of 1/2^v for v >= 32 pub fn update(&mut self, lg_config_k: u8, old_value: u8, new_value: u8) { let k = (1 << lg_config_k) as f64; @@ -109,9 +109,9 @@ impl HipEstimator { /// /// # Arguments /// - /// * `lg_config_k` - Log2 of number of registers (k) - /// * `cur_min` - Current minimum register value (for Array4, 0 for Array6/8) - /// * `num_at_cur_min` - Number of registers at cur_min value + /// * `lg_config_k`: Log2 of number of registers (k) + /// * `cur_min`: Current minimum register value (for Array4, 0 for Array6/8) + /// * `num_at_cur_min`: Number of registers at cur_min value pub fn estimate(&self, lg_config_k: u8, cur_min: u8, num_at_cur_min: u32) -> f64 { if self.out_of_order { self.get_composite_estimate(lg_config_k, cur_min, num_at_cur_min) @@ -126,10 +126,10 @@ impl HipEstimator { /// /// # Arguments /// - /// * `lg_config_k` - Log2 of number of registers (k) - /// * `cur_min` - Current minimum register value (for Array4, 0 for Array6/8) - /// * `num_at_cur_min` - Number of registers at cur_min value - /// * `num_std_dev` - Number of standard deviations (1, 2, or 3) + /// * `lg_config_k`: Log2 of number of registers (k) + /// * `cur_min`: Current minimum register value (for Array4, 0 for Array6/8) + /// * `num_at_cur_min`: Number of registers at cur_min value + /// * `num_std_dev`: Number of standard deviations (1, 2, or 3) pub fn upper_bound( &self, lg_config_k: u8, @@ -149,10 +149,10 @@ impl HipEstimator { /// /// # Arguments /// - /// * `lg_config_k` - Log2 of number of registers (k) - /// * `cur_min` - Current minimum register value (for Array4, 0 for Array6/8) - /// * `num_at_cur_min` - Number of registers at cur_min value - /// * `num_std_dev` - Number of standard deviations (1, 2, or 3) + /// * `lg_config_k`: Log2 of number 
of registers (k) + /// * `cur_min`: Current minimum register value (for Array4, 0 for Array6/8) + /// * `num_at_cur_min`: Number of registers at cur_min value + /// * `num_std_dev`: Number of standard deviations (1, 2, or 3) pub fn lower_bound( &self, lg_config_k: u8, @@ -286,8 +286,8 @@ impl HipEstimator { /// Set the out-of-order flag /// /// This should be set to true when: - /// - Deserializing a sketch from bytes - /// - After a merge/union operation + /// * Deserializing a sketch from bytes + /// * After a merge/union operation pub fn set_out_of_order(&mut self, ooo: bool) { self.out_of_order = ooo; if ooo { @@ -331,10 +331,10 @@ fn inv_pow2(value: u8) -> f64 { /// /// # Arguments /// -/// * `lg_config_k` - Log2 of number of registers (must be 4-21) -/// * `upper_bound` - Whether computing upper bound (vs lower bound) -/// * `ooo` - Whether sketch is out-of-order (merged/deserialized) -/// * `num_std_dev` - Number of standard deviations (1, 2, or 3) +/// * `lg_config_k`: Log2 of number of registers (must be 4-21) +/// * `upper_bound`: Whether computing upper bound (vs lower bound) +/// * `ooo`: Whether sketch is out-of-order (merged/deserialized) +/// * `num_std_dev`: Number of standard deviations (1, 2, or 3) /// /// # Returns /// @@ -357,7 +357,7 @@ fn get_rel_err(lg_config_k: u8, upper_bound: bool, ooo: bool, num_std_dev: NumSt return sign * (num_std_dev as u8 as f64) * rse_factor / k.sqrt(); } - // For lg_k <= 12, use empirically measured lookup tables + // For lg_k <= 12, use empirically measured lookup tables. 
// Tables are indexed by: ((lg_k - 4) * 3) + (num_std_dev - 1) let idx = ((lg_config_k as usize) - 4) * 3 + ((num_std_dev as usize) - 1); diff --git a/datasketches/src/hll/harmonic_numbers.rs b/datasketches/src/hll/harmonic_numbers.rs index cdc4161..dea7141 100644 --- a/datasketches/src/hll/harmonic_numbers.rs +++ b/datasketches/src/hll/harmonic_numbers.rs @@ -86,8 +86,8 @@ fn harmonic_number(n: usize) -> f64 { /// /// # Arguments /// -/// * `bit_vector_length` - Total length of bit vector (k for HLL) -/// * `num_bits_set` - Number of bits set (non-zero registers) +/// * `bit_vector_length`: Total length of bit vector (k for HLL) +/// * `num_bits_set`: Number of bits set (non-zero registers) /// /// # Returns /// diff --git a/datasketches/src/hll/mod.rs b/datasketches/src/hll/mod.rs index f9476fe..6f99a49 100644 --- a/datasketches/src/hll/mod.rs +++ b/datasketches/src/hll/mod.rs @@ -26,9 +26,9 @@ //! This implementation follows the Apache DataSketches specification and supports multiple //! storage modes that automatically adapt based on cardinality: //! -//! - **List mode**: Stores individual values for small cardinalities -//! - **Set mode**: Uses a hash set for medium cardinalities -//! - **HLL mode**: Uses compact arrays for large cardinalities +//! * **List mode**: Stores individual values for small cardinalities +//! * **Set mode**: Uses a hash set for medium cardinalities +//! * **HLL mode**: Uses compact arrays for large cardinalities //! //! Mode transitions are automatic and transparent to the user. Each promotion preserves //! all previously observed values and maintains estimation accuracy. @@ -44,9 +44,9 @@ //! //! Three target HLL types are supported, trading precision for memory: //! -//! - [`HllType::Hll4`]: 4 bits per bucket (most compact) -//! - [`HllType::Hll6`]: 6 bits per bucket (balanced) -//! - [`HllType::Hll8`]: 8 bits per bucket (highest precision) +//! * [`HllType::Hll4`]: 4 bits per bucket (most compact) +//! 
* [`HllType::Hll6`]: 6 bits per bucket (balanced) +//! * [`HllType::Hll8`]: 8 bits per bucket (highest precision) //! //! # Union Operations //! @@ -54,9 +54,9 @@ //! It maintains an internal "gadget" sketch that accumulates the union of all input sketches //! and automatically handles: //! -//! - Sketches with different `lg_k` precision levels (resizes/downsamples as needed) -//! - Sketches in different modes (List, Set, or Array) -//! - Sketches with different target HLL types +//! * Sketches with different `lg_k` precision levels (resizes/downsamples as needed) +//! * Sketches in different modes (List, Set, or Array) +//! * Sketches with different target HLL types //! //! The union operation preserves cardinality estimation accuracy while enabling distributed //! computation patterns where sketches are built independently and merged later. @@ -64,10 +64,10 @@ //! # Serialization //! //! Sketches can be serialized and deserialized while preserving all state, including: -//! - Current mode and HLL type -//! - All observed values (coupons or register values) -//! - HIP accumulator state for accurate estimation -//! - Out-of-order flag for merged/deserialized sketches +//! * Current mode and HLL type +//! * All observed values (coupons or register values) +//! * HIP accumulator state for accurate estimation +//! * Out-of-order flag for merged/deserialized sketches //! //! The serialization format is compatible with Apache DataSketches implementations //! in Java and C++, enabling cross-platform sketch exchange. 
diff --git a/datasketches/src/hll/serialization.rs b/datasketches/src/hll/serialization.rs index 014b890..30740a9 100644 --- a/datasketches/src/hll/serialization.rs +++ b/datasketches/src/hll/serialization.rs @@ -64,8 +64,8 @@ pub fn extract_tgt_hll_type(mode_byte: u8) -> u8 { /// /// # Arguments /// -/// * `cur_mode` - 0 = LIST, 1 = SET, 2 = HLL -/// * `tgt_type` - 0 = HLL4, 1 = HLL6, 2 = HLL8 +/// * `cur_mode`: 0 = LIST, 1 = SET, 2 = HLL +/// * `tgt_type`: 0 = HLL4, 1 = HLL6, 2 = HLL8 #[inline] pub fn encode_mode_byte(cur_mode: u8, tgt_type: u8) -> u8 { (cur_mode & 0x3) | ((tgt_type & 0x3) << 2) diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs index 484e16a..ecf3ff1 100644 --- a/datasketches/src/hll/sketch.rs +++ b/datasketches/src/hll/sketch.rs @@ -54,15 +54,15 @@ impl HllSketch { /// /// # Arguments /// - /// * `lg_config_k` - Log2 of the number of buckets (K). Must be in [4, 21]. - /// - lg_k=4: 16 buckets, ~26% relative error - /// - lg_k=12: 4096 buckets, ~1.6% relative error (common choice) - /// - lg_k=21: 2M buckets, ~0.4% relative error - /// * `hll_type` - Target HLL array type (Hll4, Hll6, or Hll8) + /// * `lg_config_k`: Log2 of the number of buckets (K). Must be in `[4, 21]`. 
+ /// * lg_k=4: 16 buckets, ~26% relative error + /// * lg_k=12: 4096 buckets, ~1.6% relative error (common choice) + /// * lg_k=21: 2M buckets, ~0.4% relative error + /// * `hll_type`: Target HLL array type (Hll4, Hll6, or Hll8) /// /// # Panics /// - /// If lg_config_k is not in range [4, 21] + /// If lg_config_k is not in range `[4, 21]` /// /// # Examples /// @@ -94,8 +94,8 @@ impl HllSketch { /// /// # Arguments /// - /// * `lg_config_k` - Log2 of the number of buckets (K) - /// * `mode` - The mode to initialize the sketch with + /// * `lg_config_k`: Log2 of the number of buckets (K) + /// * `mode`: The mode to initialize the sketch with pub(super) fn from_mode(lg_config_k: u8, mode: Mode) -> Self { Self { lg_config_k, mode } } diff --git a/datasketches/src/hll/union.rs b/datasketches/src/hll/union.rs index 03fb4ea..5f3929d 100644 --- a/datasketches/src/hll/union.rs +++ b/datasketches/src/hll/union.rs @@ -24,9 +24,9 @@ //! //! The union maintains an internal "gadget" sketch that accumulates the union //! of all input sketches. It can handle sketches with: -//! - Different lg_k values (automatically resizes as needed) -//! - Different modes (List, Set, Array4/6/8) -//! - Different target HLL types +//! * Different lg_k values (automatically resizes as needed) +//! * Different modes (List, Set, Array4/6/8) +//! * Different target HLL types use std::hash::Hash; @@ -59,13 +59,13 @@ impl HllUnion { /// /// # Arguments /// - /// * `lg_max_k` - Maximum log2 of the number of buckets. Must be in [4, 21]. This determines + /// * `lg_max_k`: Maximum log2 of the number of buckets. Must be in `[4, 21]`. This determines /// the maximum precision the union can handle. Input sketches with larger lg_k will be /// down-sampled. /// /// # Panics /// - /// Panics if `lg_max_k` is not in the range [4, 21]. + /// Panics if `lg_max_k` is not in the range `[4, 21]`. 
/// /// # Examples /// @@ -110,9 +110,9 @@ impl HllUnion { /// Update the union with another sketch /// /// Merges the input sketch into the union's internal gadget, handling: - /// - Sketches with different lg_k values (resizes/downsamples as needed) - /// - Sketches in different modes (List, Set, Array4/6/8) - /// - Sketches with different target HLL types + /// * Sketches with different lg_k values (resizes/downsamples as needed) + /// * Sketches in different modes (List, Set, Array4/6/8) + /// * Sketches with different target HLL types /// /// # Examples /// @@ -244,7 +244,7 @@ impl HllUnion { /// /// # Arguments /// - /// * `hll_type` - The target HLL type for the result sketch (Hll4, Hll6, or Hll8) + /// * `hll_type`: The target HLL type for the result sketch (Hll4, Hll6, or Hll8) /// /// # Examples /// @@ -401,9 +401,9 @@ fn merge_coupons_into_mode(dst: &mut Array8, src_mode: &Mode) { /// Merge an HLL array into an Array8 /// /// Handles merging from Array4, Array6, or Array8 sources. Dispatches based on lg_k: -/// - Same lg_k: optimized bulk merge -/// - src lg_k > dst lg_k: downsample src into dst -/// - src lg_k < dst lg_k: handled by caller (requires gadget replacement) +/// * Same lg_k: optimized bulk merge +/// * src lg_k > dst lg_k: downsample src into dst +/// * src lg_k < dst lg_k: handled by caller (requires gadget replacement) fn merge_array_into_array8(dst_array8: &mut Array8, dst_lg_k: u8, src_mode: &Mode, src_lg_k: u8) { assert!( src_lg_k >= dst_lg_k, diff --git a/datasketches/src/theta/bit_pack.rs b/datasketches/src/theta/bit_pack.rs index 031afce..2a59351 100644 --- a/datasketches/src/theta/bit_pack.rs +++ b/datasketches/src/theta/bit_pack.rs @@ -4972,9 +4972,9 @@ fn unpack_bits_63(values: &mut [u64], bytes: &[u8]) { /// /// # Panics /// -/// - Panics if `values.len()` is not equal to `BLOCK_WIDTH`. -/// - Panics if `bits` is not in the range `1..=63`. -/// - Panics if `bytes.len()` is less than `bits * BLOCK_WIDTH`. 
+/// * Panics if `values.len()` is not equal to `BLOCK_WIDTH`. +/// * Panics if `bits` is not in the range `1..=63`. +/// * Panics if `bytes.len()` is less than `bits * BLOCK_WIDTH`. pub(crate) fn pack_bits_block(values: &[u64], bytes: &mut [u8], bits: u8) { assert_eq!(values.len(), BLOCK_WIDTH, "values length must be 8"); assert!( @@ -5058,9 +5058,9 @@ pub(crate) fn pack_bits_block(values: &[u64], bytes: &mut [u8], bits: u8) { /// /// # Panics /// -/// - Panics if `values.len()` is not equal to `BLOCK_WIDTH`. -/// - Panics if `bits` is not in the range `1..=63`. -/// - Panics if `bytes.len()` is less than `bits * BLOCK_WIDTH`. +/// * Panics if `values.len()` is not equal to `BLOCK_WIDTH`. +/// * Panics if `bits` is not in the range `1..=63`. +/// * Panics if `bytes.len()` is less than `bits * BLOCK_WIDTH`. pub(crate) fn unpack_bits_block(values: &mut [u64], bytes: &[u8], bits: u8) { assert_eq!(values.len(), BLOCK_WIDTH, "values length must be 8"); assert!( diff --git a/datasketches/src/theta/hash_table.rs b/datasketches/src/theta/hash_table.rs index d77304e..b06c695 100644 --- a/datasketches/src/theta/hash_table.rs +++ b/datasketches/src/theta/hash_table.rs @@ -587,7 +587,7 @@ mod tests { let mut table = ThetaHashTable::new(8, ResizeFactor::X8, 1.0, DEFAULT_UPDATE_SEED); // Insert some values - let mut inserted_hashes = Vec::new(); + let mut inserted_hashes = vec![]; for i in 0..10 { let hash = table.hash_and_screen(format!("value_{}", i)); if hash != 0 && table.try_insert(hash) { @@ -633,7 +633,7 @@ mod tests { // Insert many values to trigger rebuild let mut i = 0; - let mut inserted_hashes = Vec::new(); + let mut inserted_hashes = vec![]; loop { let hash = table.hash_and_screen(format!("value_{}", i)); i += 1; diff --git a/datasketches/src/theta/mod.rs b/datasketches/src/theta/mod.rs index 1d33a71..fdde037 100644 --- a/datasketches/src/theta/mod.rs +++ b/datasketches/src/theta/mod.rs @@ -27,8 +27,8 @@ //! 
Theta sketches provide approximate distinct count (cardinality) estimation with //! configurable accuracy and memory usage. The implementation supports: //! -//! - **ThetaSketch**: Mutable sketch for building from input data -//! - **CompactThetaSketch**: Immutable sketch with compact memory layout +//! * **ThetaSketch**: Mutable sketch for building from input data +//! * **CompactThetaSketch**: Immutable sketch with compact memory layout //! //! # Usage //! diff --git a/datasketches/src/theta/sketch.rs b/datasketches/src/theta/sketch.rs index a56d3c4..32f6e9a 100644 --- a/datasketches/src/theta/sketch.rs +++ b/datasketches/src/theta/sketch.rs @@ -236,7 +236,7 @@ impl ThetaSketch { /// /// # Arguments /// - /// * `num_std_dev` - The number of standard deviations for confidence bounds. + /// * `num_std_dev`: The number of standard deviations for confidence bounds. /// /// # Examples /// @@ -270,7 +270,7 @@ impl ThetaSketch { /// /// # Arguments /// - /// * `num_std_dev` - The number of standard deviations for confidence bounds. + /// * `num_std_dev`: The number of standard deviations for confidence bounds. /// /// # Examples /// diff --git a/datasketches/tests/bloom_serialization_test.rs b/datasketches/tests/bloom_serialization_test.rs index 5370f89..15daba2 100644 --- a/datasketches/tests/bloom_serialization_test.rs +++ b/datasketches/tests/bloom_serialization_test.rs @@ -15,15 +15,6 @@ // specific language governing permissions and limitations // under the License. -//! Bloom Filter Serialization Compatibility Tests -//! -//! These tests verify binary compatibility with Apache DataSketches implementations: -//! - Java (datasketches-java) -//! - C++ (datasketches-cpp) -//! -//! Test data is generated by the reference implementations and stored in: -//! 
`tests/serialization_test_data/` - mod common; use std::fs; diff --git a/datasketches/tests/hll_serialization_test.rs b/datasketches/tests/hll_serialization_test.rs index 9c8200f..a7e00e6 100644 --- a/datasketches/tests/hll_serialization_test.rs +++ b/datasketches/tests/hll_serialization_test.rs @@ -15,15 +15,6 @@ // specific language governing permissions and limitations // under the License. -//! HLL Sketch Serialization Compatibility Tests -//! -//! These tests verify binary compatibility with Apache DataSketches implementations: -//! - Java (datasketches-java) -//! - C++ (datasketches-cpp) -//! -//! Test data is generated by the reference implementations and stored in: -//! `tests/serialization_test_data/` - mod common; use std::fs; @@ -48,9 +39,9 @@ fn test_sketch_file(path: PathBuf, expected_cardinality: usize, expected_lg_k: u // Check cardinality estimate with error bounds // For lg_k=12, theoretical RSE ≈ 1.625%, but we use 2% margin to account for: - // - Small sample sizes (especially n < 100) - // - Out-of-order mode (composite estimator) - // - Variation across implementations + // * Small sample sizes (especially n < 100) + // * Out-of-order mode (composite estimator) + // * Variation across implementations if expected > 0.0 { let error_margin = 0.02; // 2% error margin let lower_bound = expected * (1.0 - error_margin); diff --git a/datasketches/tests/hll_union_test.rs b/datasketches/tests/hll_union_test.rs index 2f17a29..91080bf 100644 --- a/datasketches/tests/hll_union_test.rs +++ b/datasketches/tests/hll_union_test.rs @@ -18,12 +18,12 @@ //! HyperLogLog Union Integration Tests //! //! These tests verify the public API behavior of HllUnion, focusing on: -//! - Basic union operations -//! - Mode transitions and mixed-mode unions -//! - Different HLL types and lg_k values -//! - Bounds and statistical properties -//! - Mathematical properties (commutativity, associativity, idempotency) -//! - Reset and reuse patterns +//! * Basic union operations +//! 
* Mode transitions and mixed-mode unions +//! * Different HLL types and lg_k values +//! * Bounds and statistical properties +//! * Mathematical properties (commutativity, associativity, idempotency) +//! * Reset and reuse patterns //! //! This mirrors the testing strategy used in hll_update_test.rs