Merged
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -13,7 +13,8 @@ All significant changes to this project will be documented in this file.

* `CountMinSketch` with unsigned values now supports `halve` and `decay` operations.
* `CpcSketch` and `CpcUnion` are now available for cardinality estimation.
* `FrequentItemsSketch` now supports serde for `u64` value.
* `FrequentItemsSketch` now supports serde for any value implementing `FrequentItemValue` (with built-in support for `i64`, `u64`, and `String`).
* Expose `codec::SketchBytes`, `codec::SketchSlice`, and `FrequentItemValue` as public API.

## v0.2.0 (2026-01-14)

16 changes: 8 additions & 8 deletions datasketches/src/bloom/builder.rs
@@ -22,9 +22,9 @@ use crate::hash::DEFAULT_UPDATE_SEED;
/// Builder for creating [`BloomFilter`] instances.
///
/// Provides two construction modes:
/// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
/// * [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
/// (recommended)
/// - [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual)
/// * [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual)
#[derive(Debug, Clone)]
pub struct BloomFilterBuilder {
num_bits: u64,
@@ -52,8 +52,8 @@ impl BloomFilterBuilder {
///
/// # Arguments
///
/// - `max_items`: Maximum expected number of distinct items
/// - `fpp`: Target false positive probability (e.g., 0.01 for 1%)
/// * `max_items`: Maximum expected number of distinct items
/// * `fpp`: Target false positive probability (e.g., 0.01 for 1%)
///
/// # Panics
///
@@ -95,14 +95,14 @@ impl BloomFilterBuilder {
///
/// # Arguments
///
/// - `num_bits`: Total number of bits in the filter
/// - `num_hashes`: Number of hash functions to use
/// * `num_bits`: Total number of bits in the filter
/// * `num_hashes`: Number of hash functions to use
///
/// # Panics
///
/// Panics if any of:
/// - `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` > [`Self::MAX_NUM_BITS`]
/// - `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > [`Self::MIN_NUM_HASHES`]
/// * `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` > [`Self::MAX_NUM_BITS`]
/// * `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > [`Self::MAX_NUM_HASHES`]
///
/// # Examples
///
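The `with_accuracy` mode documented in this file maps a target item count and false-positive rate to a concrete filter size. As a rough standalone sketch of the standard Bloom-filter formulas `m = -n·ln(p)/(ln 2)²` and `k = (m/n)·ln 2` (function names here are illustrative, not the crate's API):

```rust
/// Suggested number of bits: m = -n * ln(p) / (ln 2)^2.
fn suggest_num_bits(max_items: u64, fpp: f64) -> u64 {
    assert!(max_items > 0 && fpp > 0.0 && fpp < 1.0);
    let ln2 = std::f64::consts::LN_2;
    (-(max_items as f64) * fpp.ln() / (ln2 * ln2)).ceil() as u64
}

/// Suggested number of hash functions: k = (m / n) * ln 2, rounded, at least 1.
fn suggest_num_hashes(num_bits: u64, max_items: u64) -> u32 {
    ((num_bits as f64 / max_items as f64) * std::f64::consts::LN_2)
        .round()
        .max(1.0) as u32
}

fn main() {
    let m = suggest_num_bits(1000, 0.01);
    let k = suggest_num_hashes(m, 1000);
    // About 9.6 bits and 7 hash functions per item for a 1% target fpp.
    println!("bits = {m}, hashes = {k}");
}
```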
20 changes: 10 additions & 10 deletions datasketches/src/bloom/mod.rs
@@ -23,10 +23,10 @@
//!
//! # Properties
//!
//! - **No false negatives**: If an item was inserted, `contains()` will always return `true`
//! - **Possible false positives**: `contains()` may return `true` for items never inserted
//! - **Fixed size**: Unlike typical sketches, Bloom filters do not resize automatically
//! - **Linear space**: Size is proportional to the expected number of distinct items
//! * **No false negatives**: If an item was inserted, `contains()` will always return `true`
//! * **Possible false positives**: `contains()` may return `true` for items never inserted
//! * **Fixed size**: Unlike typical sketches, Bloom filters do not resize automatically
//! * **Linear space**: Size is proportional to the expected number of distinct items
//!
//! # Usage
//!
@@ -109,15 +109,15 @@
//!
//! # Implementation Details
//!
//! - Uses XXHash64 for hashing
//! - Implements double hashing (Kirsch-Mitzenmacher method) for k hash functions
//! - Bits packed efficiently in `u64` words
//! - Compatible serialization format (family ID: 21)
//! * Uses XXHash64 for hashing
//! * Implements double hashing (Kirsch-Mitzenmacher method) for k hash functions
//! * Bits packed efficiently in `u64` words
//! * Compatible serialization format (family ID: 21)
//!
//! # References
//!
//! - Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with allowable errors"
//! - Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: Building a Better Bloom
//! * Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with allowable errors"
//! * Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: Building a Better Bloom
//! Filter"

mod builder;
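The Kirsch-Mitzenmacher double hashing noted under Implementation Details derives all k probe positions from just two base hash values. A minimal illustration (simplified; the crate's actual index derivation may differ):

```rust
/// Kirsch-Mitzenmacher double hashing: index_i = (h0 + i * h1) mod m.
/// Only two real hash evaluations are needed regardless of k.
fn bit_indices(h0: u64, h1: u64, num_hashes: u32, num_bits: u64) -> Vec<u64> {
    (0..num_hashes as u64)
        .map(|i| h0.wrapping_add(i.wrapping_mul(h1)) % num_bits)
        .collect()
}

fn main() {
    let idx = bit_indices(0xDEAD_BEEF, 0x1234_5678, 7, 1 << 20);
    assert_eq!(idx.len(), 7); // one probe position per hash function
    println!("{idx:?}");
}
```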
30 changes: 15 additions & 15 deletions datasketches/src/bloom/sketch.rs
@@ -33,9 +33,9 @@ const EMPTY_FLAG_MASK: u8 = 1 << 2;
/// A Bloom filter for probabilistic set membership testing.
///
/// Provides fast membership queries with:
/// - No false negatives (inserted items always return `true`)
/// - Tunable false positive rate
/// - Constant space usage
/// * No false negatives (inserted items always return `true`)
/// * Tunable false positive rate
/// * Constant space usage
///
/// Use [`super::BloomFilterBuilder`] to construct instances.
#[derive(Debug, Clone, PartialEq)]
@@ -54,8 +54,8 @@ impl BloomFilter {
/// Tests whether an item is possibly in the set.
///
/// Returns:
/// - `true`: Item was **possibly** inserted (or false positive)
/// - `false`: Item was **definitely not** inserted
/// * `true`: Item was **possibly** inserted (or false positive)
/// * `false`: Item was **definitely not** inserted
///
/// # Examples
///
@@ -290,8 +290,8 @@ impl BloomFilter {
///
/// Uses the approximation: `load_factor^k`
/// where:
/// - load_factor = fraction of bits set (bits_used / capacity)
/// - k = num_hashes
/// * load_factor = fraction of bits set (bits_used / capacity)
/// * k = num_hashes
///
/// This assumes uniform bit distribution and is more accurate than
/// trying to estimate insertion count from the load factor.
@@ -307,9 +307,9 @@ impl BloomFilter {
/// Checks if two filters are compatible for merging.
///
/// Filters are compatible if they have the same:
/// - Capacity (number of bits)
/// - Number of hash functions
/// - Seed
/// * Capacity (number of bits)
/// * Number of hash functions
/// * Seed
pub fn is_compatible(&self, other: &Self) -> bool {
self.bit_array.len() == other.bit_array.len()
&& self.num_hashes == other.num_hashes
@@ -379,9 +379,9 @@ impl BloomFilter {
/// # Errors
///
/// Returns an error if:
/// - The data is truncated or corrupted
/// - The family ID doesn't match (not a Bloom filter)
/// - The serial version is unsupported
/// * The data is truncated or corrupted
/// * The family ID doesn't match (not a Bloom filter)
/// * The serial version is unsupported
///
/// # Examples
///
@@ -501,8 +501,8 @@ impl BloomFilter {
/// Computes the two base hash values using XXHash64.
///
/// Uses a two-hash approach:
/// - h0 = XXHash64(item, seed)
/// - h1 = XXHash64(item, h0)
/// * h0 = XXHash64(item, seed)
/// * h1 = XXHash64(item, h0)
fn compute_hash<T: Hash>(&self, item: &T) -> (u64, u64) {
// First hash with the configured seed
let mut hasher = XxHash64::with_seed(self.seed);
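The doc comment above describes the current false-positive estimate as `load_factor^k`. That approximation is easy to state directly; a hedged standalone sketch (not the crate's code):

```rust
/// Estimate the current false-positive probability as load_factor^k,
/// where load_factor is the fraction of bits currently set.
/// Assumes a uniform bit distribution, as the doc comment notes.
fn estimate_fpp(bits_set: u64, capacity_bits: u64, num_hashes: u32) -> f64 {
    let load_factor = bits_set as f64 / capacity_bits as f64;
    load_factor.powi(num_hashes as i32)
}

fn main() {
    // A half-full filter with 7 hash functions: (0.5)^7 = 0.0078125.
    println!("{:.7}", estimate_fpp(512, 1024, 7));
}
```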
52 changes: 26 additions & 26 deletions datasketches/src/common/binomial_bounds.rs
@@ -274,9 +274,9 @@ static UB_EQUIV_TABLE: [f64; 363] = [
///
/// # Arguments
///
/// * `num_samples` - The number of samples in the sample set.
/// * `theta` - The sampling probability. Must be in the range (0.0, 1.0].
/// * `num_std_dev` - The number of standard deviations for confidence bounds.
/// * `num_samples`: The number of samples in the sample set.
/// * `theta`: The sampling probability. Must be in the range (0.0, 1.0].
/// * `num_std_dev`: The number of standard deviations for confidence bounds.
///
/// # Returns
///
@@ -301,11 +301,11 @@ pub(crate) fn lower_bound(
///
/// # Arguments
///
/// * `num_samples` - The number of samples in the sample set.
/// * `theta` - The sampling probability. Must be in the range `(0.0, 1.0]`.
/// * `num_std_dev` - The number of standard deviations for confidence bounds.
/// * `no_data_seen` - This is normally false. However, in the case where you have zero samples and
/// a theta < 1.0, this flag enables the distinction between a virgin case when no actual data has
/// * `num_samples`: The number of samples in the sample set.
/// * `theta`: The sampling probability. Must be in the range `(0.0, 1.0]`.
/// * `num_std_dev`: The number of standard deviations for confidence bounds.
/// * `no_data_seen`: This is normally false. However, in the case where you have zero samples and a
/// theta < 1.0, this flag enables the distinction between a virgin case when no actual data has
/// been seen and the case where the estimate may be zero but an upper error bound may still
/// exist.
///
@@ -367,16 +367,16 @@ fn cont_classic_ub(num_samples: u64, theta: f64, num_std_devs: f64) -> f64 {
///
/// # Arguments
///
/// * `num_samples` - The number of observed samples (k). Must be >= 1.
/// * `p` - The sampling probability. Must satisfy: 0 < p < 1.
/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1.
/// * `num_samples`: The number of observed samples (k). Must be >= 1.
/// * `p`: The sampling probability. Must satisfy: 0 < p < 1.
/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1.
///
/// # Invariants
///
/// - `num_samples >= 1`
/// - `0.0 < p < 1.0`
/// - `0.0 < delta < 1.0`
/// - `(num_samples / p) < 500.0` (enforced for performance and numerical stability)
/// * `num_samples >= 1`
/// * `0.0 < p < 1.0`
/// * `0.0 < delta < 1.0`
/// * `(num_samples / p) < 500.0` (enforced for performance and numerical stability)
///
/// # Returns
///
@@ -413,15 +413,15 @@ fn special_n_star(num_samples: u64, p: f64, delta: f64) -> Result<u64, Error> {
///
/// # Arguments
///
/// * `num_samples` - The number of observed samples (k). Must be >= 1.
/// * `p` - The sampling probability. Must satisfy: 0 < p < 1.
/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1.
/// * `num_samples`: The number of observed samples (k). Must be >= 1.
/// * `p`: The sampling probability. Must satisfy: 0 < p < 1.
/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1.
///
/// # Invariants
///
/// - `num_samples >= 1`
/// - `0.0 < p < 1.0`
/// - `0.0 < delta < 1.0`
/// * `num_samples >= 1`
/// * `0.0 < p < 1.0`
/// * `0.0 < delta < 1.0`
///
/// # Returns
///
@@ -452,14 +452,14 @@ fn special_n_prime_b(num_samples: u64, p: f64, delta: f64) -> Result<u64, Error> {
///
/// # Arguments
///
/// * `num_samples` - The number of observed samples (k). Must be >= 1.
/// * `p` - The sampling probability. Must satisfy: 0 < p < 1.
/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1.
/// * `num_samples`: The number of observed samples (k). Must be >= 1.
/// * `p`: The sampling probability. Must satisfy: 0 < p < 1.
/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1.
///
/// # Invariants
///
/// - `(num_samples / p) < 500.0` (enforced for performance)
/// - A super-small delta could also make it slow.
/// * `(num_samples / p) < 500.0` (enforced for performance)
/// * A super-small delta could also make it slow.
fn special_n_prime_f(num_samples: u64, p: f64, delta: f64) -> Result<u64, Error> {
// Use a different algorithm if the following is true; this one will be too slow, or worse.
if (num_samples as f64 / p) >= 500.0 {
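For intuition about what `lower_bound`/`upper_bound` return in the continuous "classic" regime (`cont_classic_ub` above), the estimate `num_samples / theta` can be treated as approximately Gaussian. The following is an illustrative approximation only — the variance formula is this sketch's own assumption, and the real implementation uses exact table-based bounds for small sample counts:

```rust
/// Gaussian approximation to the binomial confidence bounds.
/// With k ~ Binomial(N, theta): N_hat = k / theta and
/// sd(N_hat) ≈ sqrt(k * (1 - theta)) / theta (an assumption of this sketch).
fn approx_bounds(num_samples: u64, theta: f64, num_std_dev: f64) -> (f64, f64, f64) {
    let k = num_samples as f64;
    let est = k / theta;
    let sd = (k * (1.0 - theta)).sqrt() / theta;
    let lb = (est - num_std_dev * sd).max(k); // never below the observed count
    let ub = est + num_std_dev * sd;
    (lb, est, ub)
}

fn main() {
    let (lb, est, ub) = approx_bounds(1000, 0.1, 2.0);
    assert!(lb <= est && est <= ub);
    println!("lb={lb:.0} est={est:.0} ub={ub:.0}");
}
```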
8 changes: 4 additions & 4 deletions datasketches/src/countmin/sketch.rs
@@ -75,10 +75,10 @@ impl<T: CountMinValue> CountMinSketch<T> {
/// # Panics
///
/// Panics if any of:
/// - `num_hashes` is 0
/// - `num_buckets` is less than 3
/// - the total table size exceeds the supported limit
/// - the computed seed hash is zero
/// * `num_hashes` is 0
/// * `num_buckets` is less than 3
/// * the total table size exceeds the supported limit
/// * the computed seed hash is zero
///
/// # Examples
///
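The `num_hashes` by `num_buckets` table validated above is the standard count-min layout: updates add the count to one cell per row, and a point query takes the minimum over rows. A toy standalone version (illustrative only; it uses `DefaultHasher` with a per-row seed rather than the crate's seeded hashing scheme):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

struct TinyCountMin {
    table: Vec<Vec<u64>>, // num_hashes rows x num_buckets columns
}

impl TinyCountMin {
    fn new(num_hashes: usize, num_buckets: usize) -> Self {
        Self { table: vec![vec![0; num_buckets]; num_hashes] }
    }

    fn bucket<T: Hash>(&self, row: usize, item: &T) -> usize {
        let mut h = DefaultHasher::new();
        row.hash(&mut h); // per-row "seed" so each row hashes differently
        item.hash(&mut h);
        (h.finish() as usize) % self.table[row].len()
    }

    fn update<T: Hash>(&mut self, item: &T, count: u64) {
        for row in 0..self.table.len() {
            let b = self.bucket(row, item);
            self.table[row][b] += count;
        }
    }

    /// Point query: the minimum over rows, which never underestimates.
    fn estimate<T: Hash>(&self, item: &T) -> u64 {
        (0..self.table.len())
            .map(|row| self.table[row][self.bucket(row, item)])
            .min()
            .unwrap_or(0)
    }
}

fn main() {
    let mut cm = TinyCountMin::new(3, 32);
    cm.update(&"a", 5);
    cm.update(&"b", 2);
    assert!(cm.estimate(&"a") >= 5); // count-min never underestimates
    println!("a ≈ {}", cm.estimate(&"a"));
}
```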
67 changes: 59 additions & 8 deletions datasketches/src/frequencies/mod.rs
@@ -17,16 +17,66 @@

//! Frequency sketches for finding heavy hitters in data streams.
//!
//! This module implements the Frequent Items sketch from Apache DataSketches. It tracks
//! approximate frequencies in a stream and can report heavy hitters with explicit
//! error guarantees (no false negatives or no false positives).
//! # Overview
//!
//! For background, see the Java documentation:
//! <https://apache.github.io/datasketches-java/9.0.0/org/apache/datasketches/frequencies/FrequentItemsSketch.html>
//! This sketch is based on the paper ["A High-Performance Algorithm for Identifying Frequent Items
//! in Data Streams"](https://arxiv.org/abs/1705.07001) by Daniel Anderson, Pryce Bevan, Kevin Lang,
//! Edo Liberty, Lee Rhodes, and Justin Thaler.
//!
//! # Usage
//! This sketch is useful for tracking approximate frequencies of items of a type `T` that
//! implements [`FrequentItemValue`], with optional associated counts (a `T` item paired with a
//! `u64` count), drawn from a multiset of such items. The true frequency of an item is defined
//! as the sum of its associated counts.
//!
//! ```rust
//! This implementation provides the following capabilities:
//! * Estimate the frequency of an item.
//! * Return upper and lower bounds for any item, such that the true frequency always lies
//!   between the upper and lower bounds.
//! * Return a global maximum error that holds for all items in the stream.
//! * Return an array of frequent items under either the [`ErrorType::NoFalsePositives`] or the
//!   [`ErrorType::NoFalseNegatives`] guarantee.
//! * Merge itself with another sketch created by this module.
//! * Serialize to bytes, or deserialize from bytes, for storage or transmission.
//!
//! # Accuracy
//!
//! If fewer than `0.75 * max_map_size` different items are inserted into the sketch, the
//! estimated frequencies returned by the sketch will be exact.
//!
//! The logic of the frequent items sketch is such that the stored counts and true counts are
//! never too different. More specifically, for any item, the sketch can return an estimate of
//! the true frequency of that item, along with upper and lower bounds on the frequency (which
//! hold deterministically).
//!
//! For this implementation and for a specific active item, it is guaranteed that the true frequency
//! will be between the Upper Bound (UB) and the Lower Bound (LB) computed for that item.
//! Specifically, `(UB - LB) ≤ W * epsilon`, where `W` denotes the sum of all item counts, and
//! `epsilon = 3.5/M`, where `M` is the `max_map_size`.
//!
//! This is the worst-case guarantee, and it applies to arbitrary inputs.[^1]
//! For inputs typically seen in practice, `UB - LB` is usually much smaller.
//!
//! [^1]: For speed we do employ some randomization that introduces a small probability that our
//! proof of the worst-case bound might not apply to a given run. However, we have ensured that this
//! probability is extremely small. For example, if the stream causes one table purge (rebuild),
//! our proof of the worst case bound applies with probability at least `1 - 1E-14`. If the stream
//! causes `1E9` purges, our proof applies with probability at least `1 - 1E-5`.
//!
//! # Background
//!
//! This code implements a variant of what is commonly known as the "Misra-Gries algorithm".
//! Variants of it were discovered, rediscovered, and redesigned several times over the years:
//! * "Finding repeated elements", Misra, Gries, 1982
//! * "Frequency estimation of Internet packet streams with limited space", Demaine, Lopez-Ortiz,
//!   Munro, 2002
//! * "A simple algorithm for finding frequent elements in streams and bags", Karp, Shenker,
//!   Papadimitriou, 2003
//! * "Efficient Computation of Frequent and Top-k Elements in Data Streams", Metwally, Agrawal,
//!   Abbadi, 2006
//!
//! # Examples
//!
//! ```
//! # use datasketches::frequencies::ErrorType;
//! # use datasketches::frequencies::FrequentItemsSketch;
//! let mut sketch = FrequentItemsSketch::<i64>::new(64);
@@ -38,7 +38,7 @@
//!
//! # Serialization
//!
//! ```rust
//! ```
//! # use datasketches::frequencies::FrequentItemsSketch;
//! let mut sketch = FrequentItemsSketch::<i64>::new(64);
//! sketch.update_with_count(42, 2);
@@ -52,6 +52,7 @@ mod reverse_purge_item_hash_map;
mod serialization;
mod sketch;

pub use self::serialization::FrequentItemValue;
pub use self::sketch::ErrorType;
pub use self::sketch::FrequentItemsSketch;
pub use self::sketch::Row;
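The Misra-Gries core described in the module's Background section fits in a few lines. This is the classic textbook variant for intuition only, not the purging-hash-map implementation this module actually uses:

```rust
use std::collections::HashMap;

/// Classic Misra-Gries: with k counters, every item whose true frequency
/// exceeds n / (k + 1) over a stream of length n is guaranteed to survive
/// in the counter map.
fn misra_gries<T: Eq + std::hash::Hash + Clone>(stream: &[T], k: usize) -> HashMap<T, u64> {
    let mut counters: HashMap<T, u64> = HashMap::new();
    for item in stream {
        if let Some(c) = counters.get_mut(item) {
            *c += 1;
        } else if counters.len() < k {
            counters.insert(item.clone(), 1);
        } else {
            // Decrement every counter; drop the ones that reach zero.
            counters.values_mut().for_each(|c| *c -= 1);
            counters.retain(|_, c| *c > 0);
        }
    }
    counters
}

fn main() {
    // 60 copies of the heavy hitter, then 40 distinct light items.
    let stream: Vec<i64> = [vec![7; 60], (0..40).collect::<Vec<i64>>()].concat();
    let heavy = misra_gries(&stream, 8);
    assert!(heavy.contains_key(&7)); // the heavy hitter is retained
    println!("{:?}", heavy.get(&7));
}
```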
4 changes: 2 additions & 2 deletions datasketches/src/frequencies/reverse_purge_item_hash_map.rs
@@ -192,7 +192,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
T: Clone,
{
if self.num_active == 0 {
return Vec::new();
return vec![];
}
let mut keys = Vec::with_capacity(self.num_active);
for i in 0..self.keys.len() {
@@ -208,7 +208,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
/// Returns the active values in the map.
pub fn active_values(&self) -> Vec<u64> {
if self.num_active == 0 {
return Vec::new();
return vec![];
}
let mut values = Vec::with_capacity(self.num_active);
for i in 0..self.values.len() {