From ffe601a55fcda13d6862f537b1197a1eb25e8812 Mon Sep 17 00:00:00 2001 From: Adriano Meligrana <68152031+ameligrana@users.noreply.github.com> Date: Wed, 1 Apr 2026 00:16:10 +0200 Subject: [PATCH 1/2] Some formatting updates to the paper --- paper/ref.bib | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/paper/ref.bib b/paper/ref.bib index f5714bb..ec9e02f 100644 --- a/paper/ref.bib +++ b/paper/ref.bib @@ -37,7 +37,7 @@ @article{park2004reservoir pages = {750-762}, year = {2007}, issn = {0167-9473}, -doi = {https://doi.org/10.1016/j.csda.2007.03.010}, +doi = {10.1016/j.csda.2007.03.010}, url = {https://www.sciencedirect.com/science/article/pii/S0167947307001089}, author = {Byung-Hoon Park and George Ostrouchov and Nagiza F. Samatova}, keywords = {Data stream mining, Random sampling with replacement, Reservoir sampling}, @@ -51,7 +51,7 @@ @article{efraimidis2006weighted pages = {181-185}, year = {2006}, issn = {0020-0190}, -doi = {https://doi.org/10.1016/j.ipl.2005.11.003}, +doi = {10.1016/j.ipl.2005.11.003}, url = {https://www.sciencedirect.com/science/article/pii/S002001900500298X}, author = {Pavlos S. Efraimidis and Paul G. 
Spirakis}, keywords = {Weighted random sampling, Reservoir sampling, Randomized algorithms, Data streams, Parallel algorithms}, @@ -91,7 +91,6 @@ @InProceedings{shekelyan2021sequential booktitle = {Proceedings of The 24th International Conference on Artificial Intelligence and Statistics}, pages = {3628--3636}, year = {2021}, - editor = {Banerjee, Arindam and Fukumizu, Kenji}, volume = {130}, series = {Proceedings of Machine Learning Research}, publisher = {PMLR}, @@ -206,14 +205,13 @@ @INPROCEEDINGS{wolfrath2022 @InProceedings{lee2010stratified, author="Al-Kateb, Mohammed and Lee, Byung Suk", -editor="Gertz, Michael -and Lud{\"a}scher, Bertram", title="Stratified Reservoir Sampling over Heterogeneous Data Streams", booktitle="Scientific and Statistical Database Management", year="2010", publisher="Springer Berlin Heidelberg", pages="621--639", -isbn="978-3-642-13818-8" +isbn="978-3-642-13818-8", +doi="10.1007/978-3-642-13818-8_42", } @article{braverman2012optimal, @@ -225,8 +223,8 @@ @article{braverman2012optimal year = {2012}, note = {JCSS Knowledge Representation and Reasoning}, issn = {0022-0000}, -doi = {https://doi.org/10.1016/j.jcss.2011.04.004}, +doi = {10.1016/j.jcss.2011.04.004}, url = {https://www.sciencedirect.com/science/article/pii/S0022000011000493}, author = {Vladimir Braverman and Rafail Ostrovsky and Carlo Zaniolo}, keywords = {Data streams, Random sampling, Sliding windows}, -} \ No newline at end of file +} From e8cd5377a1fac2b6ce4a6ec5ae5a9dae696239f7 Mon Sep 17 00:00:00 2001 From: Adriano Meligrana <68152031+ameligrana@users.noreply.github.com> Date: Wed, 1 Apr 2026 00:17:01 +0200 Subject: [PATCH 2/2] Rename section from 'Related Work' to 'Statement of Need' --- paper/paper.tex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paper/paper.tex b/paper/paper.tex index 65fe28b..f39353d 100644 --- a/paper/paper.tex +++ b/paper/paper.tex @@ -24,7 +24,7 @@ \section{Introduction} Sequential sampling algorithms, by contrast, are 
typically used when either the total number of elements in the stream ($N$) or the exact total weight ($W_N$) is known in advance \cite{vitter1987efficient, shekelyan2021sequential}. These methods return an ordered sample of the stream, without keeping track of previous sampled elements. A sequential algorithm can compute, from a single random variate, how many elements to skip over before the next selection, without the need to keep a reservoir. This can make sequential methods more efficient than reservoir methods when prior knowledge of $N$ or $W_N$ is available. \texttt{StreamSampling.jl} provides a comprehensive native Julia suite of algorithms covering both categories. However, a fundamental advantage of reservoir methods over sequential methods is that they maintain a representative sample in memory at all times, even when only a portion of the $N$ elements have been processed, a property that sequential methods lack by design. -\section{Related Work} +\section{Statement of Need} Online sampling techniques have been implemented across a range of programming languages and frameworks, typically addressing specific use cases rather than providing a unified suite of algorithms. For instance, eBay's \texttt{tsv-utils} provide the \texttt{tsv-sample} command-line tool, which supports simple random sampling and weighted reservoir sampling over tabular streams, as well as Bernoulli sampling and distinct sampling for streaming scenarios. In the POSIX environment, the GNU Coreutils \texttt{shuf} binary \cite{gnucoreutilsshuf} implements unweighted reservoir sampling when the \texttt{-n} option is used to extract a fixed number of lines from potentially unbounded inputs, without needing to know the total line count in advance. @@ -63,7 +63,7 @@ \section{Implemented Methods} \hline \end{tabular}% } -\caption{Sampling algorithms implemented in \texttt{StreamSampling.jl} ($N$=population size, $K$=sample size, $W_N$=total weight). 
$^*$The expected $\mathcal{O}(K \log(N/K))$ complexity for \texttt{AlgAExpJ} holds when weights are drawn i.i.d. from a fixed distribution \cite{efraimidis2006weighted}.} +\caption{Sampling algorithms implemented in \texttt{StreamSampling.jl} ($N$=population size, $K$=sample size, $W_N$=total weight). $^*$The expected $\mathcal{O}(K \log(N/K))$ complexity for \texttt{AlgAExpJ} holds when the weights are independent and identically distributed (i.i.d.) random variables \cite{efraimidis2006weighted}.} \label{tab:methods} \end{table}