diff --git a/.github/workflows/publish-s390x-unknown-linux-gnu.yml b/.github/workflows/publish-s390x-unknown-linux-gnu.yml index 71d9d6ed5..b73748f47 100644 --- a/.github/workflows/publish-s390x-unknown-linux-gnu.yml +++ b/.github/workflows/publish-s390x-unknown-linux-gnu.yml @@ -1,5 +1,14 @@ name: Publish s390x +# Apache DataSketches (datasketches crate) is target-conditional and excluded +# on big-endian targets — see Cargo.toml's +# `[target.'cfg(not(target_endian = "big"))'.dependencies]` block. As a result, +# the s390x binaries built here do NOT include the t-digest / HyperLogLog / +# Frequent Items code paths: `stats --quantile-method approx`, +# `stats --cardinality-method approx`, and `frequency --sketch-method frequent_items` +# all return a "requires a little-endian target" error on s390x. The exact-method +# defaults are unaffected. + on: # push: # tags: diff --git a/.github/workflows/rust-s390x.yml b/.github/workflows/rust-s390x.yml index a1e3ebff5..0d3113e5c 100644 --- a/.github/workflows/rust-s390x.yml +++ b/.github/workflows/rust-s390x.yml @@ -42,5 +42,24 @@ jobs: - name: Run tests env: RUSTFLAGS: -C target-cpu=native + # Apache DataSketches (datasketches crate) is excluded on big-endian targets + # because its upstream C++ does not support big-endian. The qsv source gates + # the `stats --quantile-method approx`, `stats --cardinality-method approx`, + # and `frequency --sketch-method frequent_items` codepaths behind + # `#[cfg(not(target_endian = "big"))]` and the CLI rejects those flags at + # runtime on s390x with a clear platform error. The integration tests below + # exercise those flags and so are skipped here; the surrounding `_invalid_value_` + # / `_exact_` tests still run and cover the non-sketch paths. + # `frequency_sketch_method_invalid_map_size_is_rejected` is also skipped: + # it asserts a "power of two" error, but on s390x the new platform-rejection + # error fires before --sketch-map-size validation, so the assertion would no + # longer match. The dedicated `*_big_endian_*_rejected` tests in + # tests/test_stats.rs / tests/test_frequency.rs (compiled in only on + # big-endian) cover the platform-rejection error message instead. # run: cargo test s390x --verbose --locked --features=apply,fetch,foreach,luau,python,feature_capable,ui -- --nocapture - run: cargo test --verbose --locked --features=apply,fetch,foreach,luau,python,feature_capable,ui + run: | + cargo test --verbose --locked --features=apply,fetch,foreach,luau,python,feature_capable,ui -- \ + --skip stats_quantile_method_approx_ \ + --skip stats_cardinality_method_approx_ \ + --skip frequency_sketch_method_frequent_items_ \ + --skip frequency_sketch_method_invalid_map_size_is_rejected diff --git a/Cargo.toml b/Cargo.toml index 643611348..ca5ea6856 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -118,7 +118,6 @@ csvlens = { version = "0.15", optional = true, default-features = false, feature csvs_convert = { version = "0.12", default-features = false, features = [ "converters", ], optional = true } -datasketches = "0.2" dns-lookup = { version = "3", optional = true } directories = "6.0" dotenvy = "0.15" @@ -376,6 +375,12 @@ whatlang = { git = "https://github.com/jqnatividad/whatlang-rs", branch = "bump- # polars = { git = "https://github.com/pola-rs/polars", tag = "py-1.40.1", optional = true } polars = { git = "https://github.com/pola-rs/polars", rev = "c89ecf7", optional = true } +# Apache DataSketches (t-digest, HLL, Frequent Items) — upstream C++ does not +# support big-endian targets, so we exclude the crate on big-endian builds. +# Code paths that use it are guarded by #[cfg(not(target_endian = "big"))]. +[target.'cfg(not(target_endian = "big"))'.dependencies] +datasketches = "0.2" + [features] default = ["jemallocator"] distrib_features = [ diff --git a/src/cmd/frequency.rs b/src/cmd/frequency.rs index 6460a5d1c..2d48c8d8b 100644 --- a/src/cmd/frequency.rs +++ b/src/cmd/frequency.rs @@ -107,6 +107,10 @@ frequency options: since the sketch cannot recover the true count of items not in the top-K); rank is 0 to match the exact convention. + Note: 'frequent_items' requires a little-endian + target. Apache DataSketches does not support + big-endian platforms (e.g., s390x); on those + builds, this choice is rejected. [default: exact] --sketch-map-size Maximum map size for the Frequent Items sketch. Must be a power of two and at least 8. Larger values @@ -682,6 +686,11 @@ fn calculate_memory_aware_chunk_size_for_frequency( /// /// Returns `false` if any conflicting flag is set, if the user explicitly set /// the method (regardless of value), or if --sketch-map-size is invalid. +/// +/// On big-endian targets the Apache DataSketches port is unavailable, so this +/// function compiles to a stub that always returns `false`. The OOM auto-enable +/// path then leaves `flag_sketch_method` alone and the error propagates. +#[cfg(not(target_endian = "big"))] fn can_enable_frequent_items(args: &Args, user_set_sketch_method: bool) -> bool { if args.flag_sketch_method != "exact" || user_set_sketch_method { return false; @@ -708,6 +717,14 @@ fn can_enable_frequent_items(args: &Args, user_set_sketch_method: bool) -> bool args.flag_sketch_map_size >= 8 && args.flag_sketch_map_size.is_power_of_two() } +/// Big-endian stub: Apache DataSketches is unavailable on big-endian targets, +/// so the Frequent Items sketch cannot be auto-enabled. The OOM path falls +/// through to returning the original error. +#[cfg(target_endian = "big")] +fn can_enable_frequent_items(_args: &Args, _user_set_sketch_method: bool) -> bool { + false +} + pub fn run(argv: &[&str]) -> CliResult<()> { let mut args: Args = util::get_args(USAGE, argv)?; @@ -758,6 +775,18 @@ pub fn run(argv: &[&str]) -> CliResult<()> { args.flag_sketch_method = args.flag_sketch_method.to_lowercase(); match args.flag_sketch_method.as_str() { "exact" => {}, + // Apache DataSketches is unavailable on big-endian targets, so the + // Frequent Items sketch cannot run there. Reject upfront with a clear + // platform message instead of falling through to the normal validation. + #[cfg(target_endian = "big")] + "frequent_items" => { + return fail_incorrectusage_clierror!( + "--sketch-method frequent_items requires a little-endian target. Apache \ + DataSketches is not available on big-endian platforms (e.g., s390x). Use \ + --sketch-method exact." + ); + }, + #[cfg(not(target_endian = "big"))] "frequent_items" => { if args.flag_asc { return fail_incorrectusage_clierror!( @@ -884,6 +913,11 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // run_frequent_items doesn't consult flag_sketch_method, so the field // stays at "exact" and that's fine (cache key / diagnostics use other // paths). Verified via grep over run_frequent_items's body. + // + // Apache DataSketches is unavailable on big-endian targets, so the sketch + // fallback is gated out entirely there. The big-endian branch below + // preserves the "no index → propagate the error" behavior. + #[cfg(not(target_endian = "big"))] if matches!(e, crate::CliError::OutOfMemory(_)) && can_enable_frequent_items(&args, user_set_sketch_method) { @@ -897,6 +931,17 @@ pub fn run(argv: &[&str]) -> CliResult<()> { } else if !index_succeeded { return Err(e); } + // Big-endian: no sketch auto-enable is possible (DataSketches is + // compiled out), so the only recovery left is the index that may + // have been created above. If indexing did NOT succeed, propagate + // the original error. If it DID succeed, silently fall through and + // retry with parallel processing — matching the little-endian + // behavior on the same branch (the original `else if !index_succeeded` + // arm above) where success similarly swallows `e`. + #[cfg(target_endian = "big")] + if !index_succeeded { + return Err(e); + } }, } } @@ -3822,6 +3867,7 @@ impl Args { /// bounded by `sketch.maximum_error()` (which equals the stream length minus the active /// threshold). The "Other" row's count is `total_weight - sum(top_k_estimates)` and is /// therefore approximate. + #[cfg(not(target_endian = "big"))] #[allow(clippy::cast_precision_loss)] fn run_frequent_items(&self, rconfig: &Config) -> CliResult<()> { use datasketches::frequencies::{ErrorType, FrequentItemsSketch}; diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 46a9dc178..8cdac0729 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -215,6 +215,10 @@ stats options: crate does not expose weighted-update. * Results may differ slightly across runs with different --jobs values. + * Requires a little-endian target. Apache + DataSketches does not support big-endian + platforms (e.g., s390x); on those builds, + this choice is rejected. [default: exact] --cardinality-method Algorithm used to compute the --cardinality column. Choices: exact - track every unique value in a HashMap/Unsorted @@ -238,6 +242,10 @@ stats options: HLL union used at merge time is associative and order-invariant, so chunk completion order does not affect the final estimate. + * Requires a little-endian target. Apache + DataSketches does not support big-endian + platforms (e.g., s390x); on those builds, + this choice is rejected. [default: exact] --mode-cardinality-cap Bound mode-tracking memory on high-cardinality columns. When > 0, if a column's mode tracker grows past , qsv @@ -768,6 +776,11 @@ const MAX_STAT_COLUMNS: usize = 47; // representation. Used in `Stats::new` (per-row sketch construction) and in // `Commute::merge` (the transient `HllUnion` used to combine two sketches); // keep both call sites in lock-step by reading from this constant. +// +// Gated to little-endian: the only consumers are `datasketches::hll` call sites, +// which are themselves gated. Defining it unconditionally would trip an +// `unused-const` lint on big-endian builds. +#[cfg(not(target_endian = "big"))] const HLL_LG_K: u8 = 12; // the first N columns of each full stats record are used for the dataset @@ -993,6 +1006,12 @@ fn parse_boolean_patterns(boolean_patterns: &str) -> CliResult Vec<&'static str> { + Vec::new() +} + /// Main entry point for the stats command. /// /// This function orchestrates the entire CSV statistics computation process, including @@ -1125,6 +1156,17 @@ pub fn run(argv: &[&str]) -> CliResult<()> { }, }; + // Apache DataSketches (t-digest) is unavailable on big-endian targets, so + // --quantile-method approx cannot be honored there. Reject up front so the + // downstream code can assume `approx_quantiles == false` on big-endian. + #[cfg(target_endian = "big")] + if approx_quantiles { + return fail_incorrectusage_clierror!( + "--quantile-method approx requires a little-endian target. Apache DataSketches is not \ + available on big-endian platforms (e.g., s390x). Use --quantile-method exact." + ); + } + // approx quantiles are not yet supported with --weight: the upstream // datasketches::tdigest crate does not expose a weighted-update API. if approx_quantiles && args.flag_weight.is_some() { @@ -1158,6 +1200,17 @@ pub fn run(argv: &[&str]) -> CliResult<()> { }, } + // Apache DataSketches (HyperLogLog) is unavailable on big-endian targets, so + // --cardinality-method approx cannot be honored there. Reject up front so the + // downstream code can assume `flag_cardinality_method != "approx"` on big-endian. + #[cfg(target_endian = "big")] + if args.flag_cardinality_method == "approx" { + return fail_incorrectusage_clierror!( + "--cardinality-method approx requires a little-endian target. Apache DataSketches is \ + not available on big-endian platforms (e.g., s390x). Use --cardinality-method exact." + ); + } + // inferring boolean requires inferring cardinality if args.flag_infer_boolean { if !args.flag_cardinality { @@ -2950,9 +3003,11 @@ impl WhichStats { /// `Stats::tdigest`, so the derived `Serialize`/`Deserialize` on `Stats` skip this field /// entirely. T-digest state is intermediate; cache invalidation on `--quantile-method` /// change rides on `StatsArgs` serialization instead. +#[cfg(not(target_endian = "big"))] #[derive(Default)] struct TDigestSlot(Option); +#[cfg(not(target_endian = "big"))] impl Clone for TDigestSlot { #[inline] fn clone(&self) -> Self { @@ -2960,6 +3015,7 @@ impl Clone for TDigestSlot { } } +#[cfg(not(target_endian = "big"))] impl PartialEq for TDigestSlot { #[inline] fn eq(&self, _other: &Self) -> bool { @@ -2967,6 +3023,14 @@ impl PartialEq for TDigestSlot { } } +/// Big-endian fallback for `TDigestSlot`. Apache DataSketches is unavailable on +/// big-endian targets, so we substitute a zero-sized type. `Stats::tdigest` is +/// still declared (and `#[serde(skip)]`) on those targets but is never read; the +/// `--quantile-method approx` codepath is rejected upstream by `run()`. +#[cfg(target_endian = "big")] +#[derive(Default, Clone, PartialEq)] +struct TDigestSlot; + /// Wrapper around `datasketches::hll::HllSketch` so the `Stats` struct can keep its /// derived `Clone`, `PartialEq`, `Serialize`, `Deserialize` impls without forcing the /// upstream `HllSketch` (which only derives `Clone`/`Debug`) to grow them. @@ -2980,9 +3044,11 @@ impl PartialEq for TDigestSlot { /// `Stats::hll`, so the derived `Serialize`/`Deserialize` on `Stats` skip this field /// entirely. HLL state is intermediate; cache invalidation on `--cardinality-method` /// change rides on `StatsArgs` serialization instead. +#[cfg(not(target_endian = "big"))] #[derive(Default)] struct HllSlot(Option); +#[cfg(not(target_endian = "big"))] impl Clone for HllSlot { #[inline] fn clone(&self) -> Self { @@ -2990,6 +3056,7 @@ impl Clone for HllSlot { } } +#[cfg(not(target_endian = "big"))] impl PartialEq for HllSlot { #[inline] fn eq(&self, _other: &Self) -> bool { @@ -2997,6 +3064,14 @@ impl PartialEq for HllSlot { } } +/// Big-endian fallback for `HllSlot`. Apache DataSketches is unavailable on +/// big-endian targets, so we substitute a zero-sized type. `Stats::hll` is +/// still declared (and `#[serde(skip)]`) on those targets but is never read; the +/// `--cardinality-method approx` codepath is rejected upstream by `run()`. +#[cfg(target_endian = "big")] +#[derive(Default, Clone, PartialEq)] +struct HllSlot; + #[allow(clippy::unsafe_derive_deserialize)] #[allow(clippy::struct_field_names)] #[repr(C, align(64))] // Align to cache line size for better performance @@ -3581,9 +3656,18 @@ impl Stats { // unless --quantile-method approx is set, in which case we route values to a // t-digest instead. The two engines are mutually exclusive for a given Stats // instance. + // + // The `if which.approx_quantiles` branch is compiled out entirely on + // big-endian: Apache DataSketches is unavailable there, so even an + // internal caller that constructs `WhichStats { approx_quantiles: true, + // .. }` directly (bypassing the CLI rejection in `run()`) takes the + // exact path. This is stronger than gating only the allocation expression + // with `cfg_select!`+`unreachable!()` — there is no runtime panic path. let needs_quantiles = which.quartiles || which.median || which.mad || which.percentiles; + #[cfg_attr(target_endian = "big", allow(unused_mut))] let mut tdigest = TDigestSlot::default(); if needs_quantiles { + #[cfg(not(target_endian = "big"))] if which.approx_quantiles { // k=200 is the upstream default; ~1% rank error, more accurate at the tails. tdigest = TDigestSlot(Some(datasketches::tdigest::TDigestMut::new(200))); @@ -3593,11 +3677,24 @@ impl Stats { weighted_unsorted_stats = Some(Vec::with_capacity(record_count)); } } + #[cfg(target_endian = "big")] + { + // Exact path is the only path here: t-digest is compiled out. + unsorted_stats = Some(stats::Unsorted::with_capacity(record_count)); + if use_weights { + weighted_unsorted_stats = Some(Vec::with_capacity(record_count)); + } + } } // HyperLogLog cardinality engine: allocate when --cardinality-method approx // is selected AND cardinality output is enabled. HLL_LG_K (12) gives ~1.5% // RSE with ~5KB per column. Hll8 stores 8 bits per register (no decode work // on update); memory is the same order whether sparse or dense. + // + // Compiled out entirely on big-endian — Apache DataSketches is unavailable, + // so `hll` is always the zero-sized default there (`HllSlot` is a + // unit-like ZST on big-endian). + #[cfg(not(target_endian = "big"))] let hll = if which.cardinality && which.approx_cardinality { HllSlot(Some(datasketches::hll::HllSketch::new( HLL_LG_K, @@ -3606,6 +3703,8 @@ impl Stats { } else { HllSlot::default() }; + #[cfg(target_endian = "big")] + let hll = HllSlot::default(); Stats { typ: FieldType::default(), is_ascii: true, @@ -4896,6 +4995,10 @@ impl Commute for Stats { // at all — see the dispatch in run() around line 1456) and is therefore exactly // reproducible. Determinism for --jobs >= 2 is intentionally not guaranteed; the // --quantile-method help text documents the caveat for users. + // + // Skipped on big-endian targets: `TDigestSlot` is a unit-like ZST there (Apache + // DataSketches is unavailable), so there's no inner state to merge. + #[cfg(not(target_endian = "big"))] match (&mut self.tdigest.0, other.tdigest.0) { (Some(s), Some(o)) => s.merge(&o), (slot @ None, Some(o)) => *slot = Some(o), @@ -4907,6 +5010,10 @@ impl Commute for Stats { // for the cardinality column under --cardinality-method approx. The lg_k // here MUST match the HLL_LG_K used in `Stats::new`; reading from the same // constant prevents a silent precision downgrade if one site is bumped. + // + // Skipped on big-endian targets: `HllSlot` is a unit-like ZST there (Apache + // DataSketches is unavailable), so there's no inner state to merge. + #[cfg(not(target_endian = "big"))] match (&mut self.hll.0, other.hll.0) { (Some(s), Some(o)) => { let mut union = datasketches::hll::HllUnion::new(HLL_LG_K); diff --git a/tests/test_frequency.rs b/tests/test_frequency.rs index d5e5d17bb..df8f8d56b 100644 --- a/tests/test_frequency.rs +++ b/tests/test_frequency.rs @@ -6778,3 +6778,41 @@ fn frequency_oom_skips_when_json_output() { "frequent_items sketch must NOT auto-enable when --json is set, got: {stderr}" ); } + +// --- big-endian platform-rejection test ---------------------------------------------- +// +// Apache DataSketches does not support big-endian targets, so the qsv source +// gates the Frequent Items sketch behind `#[cfg(not(target_endian = "big"))]` +// and `run()` rejects `--sketch-method frequent_items` at the top of the +// command with a clear platform error message naming s390x. This test is +// only compiled on big-endian targets; on little-endian the flag works +// normally and the existing `frequency_sketch_method_frequent_items_*` tests +// already cover that path. +// +// Name intentionally avoids the `frequency_sketch_method_frequent_items_` +// prefix so the s390x CI `--skip` filter does not drop it. + +#[cfg(target_endian = "big")] +#[test] +fn frequency_big_endian_sketch_method_frequent_items_rejected() { + let wrk = Workdir::new("frequency_big_endian_sketch_method_frequent_items_rejected"); + wrk.create("data.csv", vec![svec!["v"], svec!["a"], svec!["b"]]); + + let mut cmd = wrk.command("frequency"); + cmd.arg("--sketch-method") + .arg("frequent_items") + .arg("data.csv"); + + let output = cmd.output().unwrap(); + assert!( + !output.status.success(), + "--sketch-method frequent_items should be rejected on big-endian" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("--sketch-method frequent_items") + && stderr.contains("requires a little-endian target") + && stderr.contains("s390x"), + "error should name the flag, the platform constraint, and s390x; got: {stderr}" + ); +} diff --git a/tests/test_stats.rs b/tests/test_stats.rs index 7b7e2107d..ed406e159 100644 --- a/tests/test_stats.rs +++ b/tests/test_stats.rs @@ -6814,3 +6814,68 @@ fn stats_cardinality_method_invalid_value_is_rejected() { "error should mention the bad flag and value, got: {stderr}" ); } + +// --- big-endian platform-rejection tests --------------------------------------------- +// +// Apache DataSketches does not support big-endian targets, so the qsv source +// gates t-digest / HyperLogLog behind `#[cfg(not(target_endian = "big"))]` +// and `run()` rejects `--quantile-method approx` / `--cardinality-method approx` +// at the top of the command with a clear platform error message naming s390x. +// These tests are only compiled on big-endian targets; on little-endian they +// would always fail (the flags work normally there) and the existing +// `*_method_approx_*` tests already cover that path. +// +// Names intentionally avoid the `stats_*_method_approx_` prefix so the s390x +// CI `--skip` filters do not drop them. + +#[cfg(target_endian = "big")] +#[test] +fn stats_big_endian_quantile_method_approx_rejected() { + let wrk = Workdir::new("stats_big_endian_quantile_method_approx_rejected"); + wrk.create("data.csv", vec![svec!["value"], svec!["1"], svec!["2"]]); + + let mut cmd = wrk.command("stats"); + cmd.arg("--quantiles") + .arg("--quantile-method") + .arg("approx") + .arg("data.csv"); + + let output = cmd.output().unwrap(); + assert!( + !output.status.success(), + "--quantile-method approx should be rejected on big-endian" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("--quantile-method approx") + && stderr.contains("requires a little-endian target") + && stderr.contains("s390x"), + "error should name the flag, the platform constraint, and s390x; got: {stderr}" + ); +} + +#[cfg(target_endian = "big")] +#[test] +fn stats_big_endian_cardinality_method_approx_rejected() { + let wrk = Workdir::new("stats_big_endian_cardinality_method_approx_rejected"); + wrk.create("data.csv", vec![svec!["value"], svec!["1"], svec!["2"]]); + + let mut cmd = wrk.command("stats"); + cmd.arg("--cardinality") + .arg("--cardinality-method") + .arg("approx") + .arg("data.csv"); + + let output = cmd.output().unwrap(); + assert!( + !output.status.success(), + "--cardinality-method approx should be rejected on big-endian" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("--cardinality-method approx") + && stderr.contains("requires a little-endian target") + && stderr.contains("s390x"), + "error should name the flag, the platform constraint, and s390x; got: {stderr}" + ); +}