Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 20 additions & 27 deletions encodings/fsst/src/tests.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use rand::SeedableRng;
use rand::rngs::StdRng;
use rand::seq::IndexedRandom;
use fsst::CompressorBuilder;
use vortex_array::ArrayRef;
use vortex_array::IntoArray;
use vortex_array::LEGACY_SESSION;
Expand Down Expand Up @@ -118,12 +116,17 @@ fn test_fsst_array_ops() {
/// [`VarBinBuilder<i32>`] for the FSST output and panicked in
/// [`VarBinBuilder::append_value`] once cumulative compressed bytes crossed the boundary.
///
/// The input is built with [`VarBinBuilder<i64>`] so the input itself does not panic, which
/// confirms the overflow is on the FSST output side. After the fix the test must succeed
/// with the row count preserved.
/// The compressor is built with an empty symbol table, so every input byte misses the table
/// and is escape-coded at exactly two output bytes per input byte. This makes the output
/// size deterministic and crosses the i32 boundary with roughly half the input bytes that
/// an incompressible-input construction would need. The input is built with
/// [`VarBinBuilder<i64>`] so the input itself does not panic, which confirms the overflow
/// is on the FSST output side. After the fix the test must succeed with the row count
/// preserved.
///
/// Allocates ~2.5 GiB for the input and ~2.5 GiB for the FSST output (~5 GiB total), so it
/// is gated to CI runs and skipped when `VORTEX_SKIP_SLOW_TESTS` is set. To run it locally:
/// Allocates ~1.1 GiB for the input and ~2.1 GiB for the FSST output (~3.2 GiB total), so
/// it is gated to CI runs and skipped when `VORTEX_SKIP_SLOW_TESTS` is set. To run it
/// locally:
///
/// ```text
/// CI=1 cargo test --release -p vortex-fsst fsst_compress_offsets
Expand All @@ -133,38 +136,28 @@ fn test_fsst_array_ops() {
#[test_with::env(CI)]
#[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)]
fn fsst_compress_offsets_overflow_i32() {
// High-entropy ASCII strings sliced from a random pool. FSST is a symbol-table
// compressor; pseudo-random data with no recurring byte sequences resists compression,
// so the compressed output stays close to input size and crosses the i32 boundary.
const STRING_LEN: usize = 64 * 1024;
const TOTAL_BYTES: usize = (1usize << 31) + (512 << 20); // ~2.5 GiB
// Escape coding doubles every byte, so ~1.06 GiB of input compresses to ~2.13 GiB,
// comfortably past i32::MAX.
const TOTAL_BYTES: usize = (1usize << 30) + (64 << 20);
const N: usize = TOTAL_BYTES / STRING_LEN;
const POOL_LEN: usize = 64 * 1024 * 1024;

// Printable ASCII alphabet so the result is valid UTF-8.
const ALPHABET: &[u8; 95] =
b" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";

let mut rng = StdRng::seed_from_u64(0xC0DE_C011_B711);
let pool: Vec<u8> = (0..POOL_LEN)
.map(|_| *ALPHABET.choose(&mut rng).unwrap())
.collect();

println!("building large VarBinArray");
let string = vec![b'a'; STRING_LEN];
let mut builder = VarBinBuilder::<i64>::with_capacity(N);
for i in 0..N {
let off = i.wrapping_mul(31337) % (POOL_LEN - STRING_LEN);
builder.append_value(&pool[off..off + STRING_LEN]);
for _ in 0..N {
builder.append_value(&string);
}
let array = builder.finish(DType::Utf8(Nullability::NonNullable));

println!("training FSST compressor");
let compressor = fsst_train_compressor(&array);
let compressor = CompressorBuilder::default().build();
let len = array.len();
let dtype = array.dtype().clone();
let mut ctx = LEGACY_SESSION.create_execution_ctx();

println!("compressing to FSST");
let compressed = fsst_compress(array, len, &dtype, &compressor, &mut ctx);
assert_eq!(compressed.len(), len);
// Prove the regression condition was exercised: compressed bytes crossed i32::MAX.
assert!(compressed.codes_bytes().len() > i32::MAX as usize);
}
Loading