From c00b58c3c5c73c4c407aaac83cf3a051e82f0443 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 9 Jun 2026 20:48:57 +0100 Subject: [PATCH] Optimize `fsst_compress_offsets_overflow_i32` to under a minute locally Signed-off-by: Adam Gutglick --- encodings/fsst/src/tests.rs | 47 ++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/encodings/fsst/src/tests.rs b/encodings/fsst/src/tests.rs index c4dca57c61b..481b84865df 100644 --- a/encodings/fsst/src/tests.rs +++ b/encodings/fsst/src/tests.rs @@ -1,9 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use rand::SeedableRng; -use rand::rngs::StdRng; -use rand::seq::IndexedRandom; +use fsst::CompressorBuilder; use vortex_array::ArrayRef; use vortex_array::IntoArray; use vortex_array::LEGACY_SESSION; @@ -118,12 +116,17 @@ fn test_fsst_array_ops() { /// [`VarBinBuilder`] for the FSST output and panicked in /// [`VarBinBuilder::append_value`] once cumulative compressed bytes crossed the boundary. /// -/// The input is built with [`VarBinBuilder`] so the input itself does not panic, which -/// confirms the overflow is on the FSST output side. After the fix the test must succeed -/// with the row count preserved. +/// The compressor is built with an empty symbol table, so every input byte misses the table +/// and is escape-coded at exactly two output bytes per input byte. This makes the output +/// size deterministic and crosses the i32 boundary with roughly half the input bytes that +/// an incompressible-input construction would need. The input is built with +/// [`VarBinBuilder`] so the input itself does not panic, which confirms the overflow +/// is on the FSST output side. After the fix the test must succeed with the row count +/// preserved. /// -/// Allocates ~2.5 GiB for the input and ~2.5 GiB for the FSST output (~5 GiB total), so it -/// is gated to CI runs and skipped when `VORTEX_SKIP_SLOW_TESTS` is set. To run it locally: +/// Allocates ~1.1 GiB for the input and ~2.1 GiB for the FSST output (~3.2 GiB total), so +/// it is gated to CI runs and skipped when `VORTEX_SKIP_SLOW_TESTS` is set. To run it +/// locally: /// /// ```text /// CI=1 cargo test --release -p vortex-fsst fsst_compress_offsets @@ -133,33 +136,21 @@ fn test_fsst_array_ops() { #[test_with::env(CI)] #[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)] fn fsst_compress_offsets_overflow_i32() { - // High-entropy ASCII strings sliced from a random pool. FSST is a symbol-table - // compressor; pseudo-random data with no recurring byte sequences resists compression, - // so the compressed output stays close to input size and crosses the i32 boundary. const STRING_LEN: usize = 64 * 1024; - const TOTAL_BYTES: usize = (1usize << 31) + (512 << 20); // ~2.5 GiB + // Escape coding doubles every byte, so ~1.06 GiB of input compresses to ~2.13 GiB, + // comfortably past i32::MAX. + const TOTAL_BYTES: usize = (1usize << 30) + (64 << 20); const N: usize = TOTAL_BYTES / STRING_LEN; - const POOL_LEN: usize = 64 * 1024 * 1024; - - // Printable ASCII alphabet so the result is valid UTF-8. - const ALPHABET: &[u8; 95] = - b" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; - - let mut rng = StdRng::seed_from_u64(0xC0DE_C011_B711); - let pool: Vec = (0..POOL_LEN) - .map(|_| *ALPHABET.choose(&mut rng).unwrap()) - .collect(); println!("building large VarBinArray"); + let string = vec![b'a'; STRING_LEN]; let mut builder = VarBinBuilder::::with_capacity(N); - for i in 0..N { - let off = i.wrapping_mul(31337) % (POOL_LEN - STRING_LEN); - builder.append_value(&pool[off..off + STRING_LEN]); + for _ in 0..N { + builder.append_value(&string); } let array = builder.finish(DType::Utf8(Nullability::NonNullable)); - println!("training FSST compressor"); - let compressor = fsst_train_compressor(&array); + let compressor = CompressorBuilder::default().build(); let len = array.len(); let dtype = array.dtype().clone(); let mut ctx = LEGACY_SESSION.create_execution_ctx(); @@ -167,4 +158,6 @@ fn fsst_compress_offsets_overflow_i32() { println!("compressing to FSST"); let compressed = fsst_compress(array, len, &dtype, &compressor, &mut ctx); assert_eq!(compressed.len(), len); + // Prove the regression condition was exercised: compressed bytes crossed i32::MAX. + assert!(compressed.codes_bytes().len() > i32::MAX as usize); }