Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/publish-s390x-unknown-linux-gnu.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
name: Publish s390x

# Apache DataSketches (datasketches crate) is target-conditional and excluded
# on big-endian targets — see Cargo.toml's
# `[target.'cfg(not(target_endian = "big"))'.dependencies]` block. As a result,
# the s390x binaries built here do NOT include the t-digest / HyperLogLog /
# Frequent Items code paths: `stats --quantile-method approx`,
# `stats --cardinality-method approx`, and `frequency --sketch-method frequent_items`
# all return a "requires a little-endian target" error on s390x. The exact-method
# defaults are unaffected.

on:
# push:
# tags:
Expand Down
21 changes: 20 additions & 1 deletion .github/workflows/rust-s390x.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,24 @@ jobs:
- name: Run tests
env:
RUSTFLAGS: -C target-cpu=native
# Apache DataSketches (datasketches crate) is excluded on big-endian targets
# because its upstream C++ does not support big-endian. The qsv source gates
# the `stats --quantile-method approx`, `stats --cardinality-method approx`,
# and `frequency --sketch-method frequent_items` codepaths behind
# `#[cfg(not(target_endian = "big"))]` and the CLI rejects those flags at
# runtime on s390x with a clear platform error. The integration tests below
# exercise those flags and so are skipped here; the surrounding `_invalid_value_`
# / `_exact_` tests still run and cover the non-sketch paths.
# `frequency_sketch_method_invalid_map_size_is_rejected` is also skipped:
# it asserts a "power of two" error, but on s390x the new platform-rejection
# error fires before --sketch-map-size validation, so the assertion would no
# longer match. The dedicated `*_big_endian_*_rejected` tests in
# tests/test_stats.rs / tests/test_frequency.rs (compiled in only on
# big-endian) cover the platform-rejection error message instead.
# run: cargo test s390x --verbose --locked --features=apply,fetch,foreach,luau,python,feature_capable,ui -- --nocapture
run: cargo test --verbose --locked --features=apply,fetch,foreach,luau,python,feature_capable,ui
run: |
cargo test --verbose --locked --features=apply,fetch,foreach,luau,python,feature_capable,ui -- \
--skip stats_quantile_method_approx_ \
--skip stats_cardinality_method_approx_ \
--skip frequency_sketch_method_frequent_items_ \
--skip frequency_sketch_method_invalid_map_size_is_rejected
7 changes: 6 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ csvlens = { version = "0.15", optional = true, default-features = false, feature
csvs_convert = { version = "0.12", default-features = false, features = [
"converters",
], optional = true }
datasketches = "0.2"
dns-lookup = { version = "3", optional = true }
directories = "6.0"
dotenvy = "0.15"
Expand Down Expand Up @@ -376,6 +375,12 @@ whatlang = { git = "https://github.com/jqnatividad/whatlang-rs", branch = "bump-
# polars = { git = "https://github.com/pola-rs/polars", tag = "py-1.40.1", optional = true }
polars = { git = "https://github.com/pola-rs/polars", rev = "c89ecf7", optional = true }

# Apache DataSketches (t-digest, HLL, Frequent Items) — upstream C++ does not
# support big-endian targets, so we exclude the crate on big-endian builds.
# Code paths that use it are guarded by #[cfg(not(target_endian = "big"))].
[target.'cfg(not(target_endian = "big"))'.dependencies]
datasketches = "0.2"

[features]
default = ["jemallocator"]
distrib_features = [
Expand Down
46 changes: 46 additions & 0 deletions src/cmd/frequency.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ frequency options:
since the sketch cannot recover the true count of
items not in the top-K); rank is 0 to match the
exact convention.
Note: 'frequent_items' requires a little-endian
target. Apache DataSketches does not support
big-endian platforms (e.g., s390x); on those
builds, this choice is rejected.
[default: exact]
--sketch-map-size <n> Maximum map size for the Frequent Items sketch.
Must be a power of two and at least 8. Larger values
Expand Down Expand Up @@ -682,6 +686,11 @@ fn calculate_memory_aware_chunk_size_for_frequency(
///
/// Returns `false` if any conflicting flag is set, if the user explicitly set
/// the method (regardless of value), or if --sketch-map-size is invalid.
///
/// On big-endian targets the Apache DataSketches port is unavailable, so this
/// function is replaced by a stub that always returns `false`. The OOM
/// auto-enable path then leaves `flag_sketch_method` alone, and the original
/// error propagates unless indexing succeeded.
#[cfg(not(target_endian = "big"))]
fn can_enable_frequent_items(args: &Args, user_set_sketch_method: bool) -> bool {
if args.flag_sketch_method != "exact" || user_set_sketch_method {
return false;
Expand All @@ -708,6 +717,14 @@ fn can_enable_frequent_items(args: &Args, user_set_sketch_method: bool) -> bool
args.flag_sketch_map_size >= 8 && args.flag_sketch_map_size.is_power_of_two()
}

/// Fallback compiled only for big-endian targets.
///
/// The `datasketches` dependency is excluded from big-endian builds, so the
/// Frequent Items sketch can never be switched on automatically there and this
/// function unconditionally answers `false`. Both parameters are ignored; they
/// exist only so the signature lines up with the little-endian implementation.
///
/// With no sketch fallback available, the out-of-memory recovery in `run`
/// returns the original error when indexing did not succeed, and otherwise
/// retries using the index.
//
// NOTE(review): the out-of-memory call site in `run` is itself gated with
// `#[cfg(not(target_endian = "big"))]`. Confirm this stub still has a caller on
// big-endian builds; if not, it will trigger a `dead_code` warning there.
#[cfg(target_endian = "big")]
fn can_enable_frequent_items(_args: &Args, _user_set_sketch_method: bool) -> bool {
    false
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let mut args: Args = util::get_args(USAGE, argv)?;

Expand Down Expand Up @@ -758,6 +775,18 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
args.flag_sketch_method = args.flag_sketch_method.to_lowercase();
match args.flag_sketch_method.as_str() {
"exact" => {},
// Apache DataSketches is unavailable on big-endian targets, so the
// Frequent Items sketch cannot run there. Reject upfront with a clear
// platform message instead of falling through to the normal validation.
#[cfg(target_endian = "big")]
"frequent_items" => {
return fail_incorrectusage_clierror!(
"--sketch-method frequent_items requires a little-endian target. Apache \
DataSketches is not available on big-endian platforms (e.g., s390x). Use \
--sketch-method exact."
);
},
#[cfg(not(target_endian = "big"))]
"frequent_items" => {
if args.flag_asc {
return fail_incorrectusage_clierror!(
Expand Down Expand Up @@ -884,6 +913,11 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
// run_frequent_items doesn't consult flag_sketch_method, so the field
// stays at "exact" and that's fine (cache key / diagnostics use other
// paths). Verified via grep over run_frequent_items's body.
//
// Apache DataSketches is unavailable on big-endian targets, so the sketch
// fallback is gated out entirely there. The big-endian branch below
// preserves the "no index → propagate the error" behavior.
#[cfg(not(target_endian = "big"))]
if matches!(e, crate::CliError::OutOfMemory(_))
&& can_enable_frequent_items(&args, user_set_sketch_method)
{
Expand All @@ -897,6 +931,17 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
} else if !index_succeeded {
return Err(e);
}
// Big-endian: no sketch auto-enable is possible (DataSketches is
// compiled out), so the only recovery left is the index that may
// have been created above. If indexing did NOT succeed, propagate
// the original error. If it DID succeed, silently fall through and
// retry with parallel processing — matching the little-endian
// behavior on the same branch (the original `else if !index_succeeded`
// arm above) where success similarly swallows `e`.
#[cfg(target_endian = "big")]
if !index_succeeded {
return Err(e);
}
},
}
}
Expand Down Expand Up @@ -3822,6 +3867,7 @@ impl Args {
/// bounded by `sketch.maximum_error()` (which equals the stream length minus the active
/// threshold). The "Other" row's count is `total_weight - sum(top_k_estimates)` and is
/// therefore approximate.
#[cfg(not(target_endian = "big"))]
#[allow(clippy::cast_precision_loss)]
fn run_frequent_items(&self, rconfig: &Config) -> CliResult<()> {
use datasketches::frequencies::{ErrorType, FrequentItemsSketch};
Expand Down
Loading
Loading