From 5fcf33c5808cf7de6a178f3da37c76cd55f96b41 Mon Sep 17 00:00:00 2001 From: luca spolladore Date: Tue, 5 May 2026 14:17:02 +0200 Subject: [PATCH] fix(schema): cast to VARCHAR before approx_distinct, add --verbose to schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DataFusion 43 reads parquet with schema_force_view_types=true, coercing LargeUtf8 to Utf8View. The HLL implementation behind approx_distinct does not support Utf8View or Date32 at execution time — the planner accepts them, but .collect() returns an error. Casting to VARCHAR before the aggregate handles every physical type: for string-like columns the cast is free, and dates and timestamps produce their canonical string form, so distinct counts remain correct. Also adds --verbose to the schema subcommand so the full anyhow error chain is visible on failure, consistent with the validate subcommand. Without this flag the root cause was completely hidden behind the top-level context string ("Failed to collect results"). 
Co-Authored-By: Claude Sonnet 4.6 --- src/main.rs | 10 +++++++++- src/schema.rs | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index 50dfcad..76dc4ca 100644 --- a/src/main.rs +++ b/src/main.rs @@ -95,6 +95,9 @@ struct ValidateArgs { struct SchemaArgs { /// Path to the dataset file (CSV or Parquet) file: String, + /// Print full error chain on failure + #[arg(long)] + verbose: bool, } #[derive(Args)] @@ -227,8 +230,13 @@ async fn main() { } } Commands::Schema(args) => { + let verbose = args.verbose; if let Err(e) = run_schema(args).await { - eprintln!("Error: {e}"); + if verbose { + eprintln!("Error: {e:#}"); + } else { + eprintln!("Error: {e}"); + } std::process::exit(1); } } diff --git a/src/schema.rs b/src/schema.rs index 0d5b292..73dba59 100644 --- a/src/schema.rs +++ b/src/schema.rs @@ -223,9 +223,18 @@ pub async fn introspect(ctx: &SessionContext, table_name: &str) -> anyhow::Resul ) .await?; + // approx_distinct uses HyperLogLog for bounded-memory cardinality estimation. + // We cast to VARCHAR first because DataFusion's HLL does not support all physical + // types that appear in parquet files (e.g. Date32, Utf8View). The cast is free for + // string-like types and produces the canonical string form for dates and timestamps, + // so distinct counts remain correct. Error is ~1% at HLL's default precision; exact + // for small cardinalities. Cast to BIGINT so run_count_sql's Int64 downcast is valid. let unique = run_count_sql( ctx, - &format!("SELECT COUNT(DISTINCT \"{}\") FROM {}", col, table_name), + &format!( + "SELECT CAST(approx_distinct(CAST(\"{}\" AS VARCHAR)) AS BIGINT) FROM {}", + col, table_name + ), ) .await?; @@ -305,7 +314,7 @@ mod tests { #[tokio::test] async fn test_introspect_unique() { - // 3 distinct values + // HLL is exact at small cardinalities — 3 distinct values. 
let ctx = make_ctx("CREATE TABLE data AS SELECT * FROM (VALUES ('a'), ('b'), ('c')) AS t(v)") .await; @@ -313,7 +322,7 @@ mod tests { let col = &out.columns[0]; assert_eq!(col.unique, 3); - // 2 distinct values (one duplicate) + // 2 distinct values (one duplicate). let ctx2 = make_ctx("CREATE TABLE data AS SELECT * FROM (VALUES ('a'), ('b'), ('a')) AS t(v)") .await; @@ -322,6 +331,25 @@ mod tests { assert_eq!(col2.unique, 2); } + #[tokio::test] + async fn test_introspect_unique_date_column() { + // Date32 columns must not crash approx_distinct. The CAST-to-VARCHAR + // workaround handles types the HLL implementation doesn't natively support. + let ctx = make_ctx( + "CREATE TABLE data AS SELECT * FROM (\ + VALUES \ + (CAST('2022-01-01' AS DATE)), \ + (CAST('2022-06-15' AS DATE)), \ + (CAST('2022-01-01' AS DATE))\ + ) AS t(d)", + ) + .await; + let out = introspect(&ctx, "data").await.unwrap(); + let col = &out.columns[0]; + // 2 distinct dates (one duplicate); HLL is exact at this cardinality. + assert_eq!(col.unique, 2); + } + #[tokio::test] async fn test_introspect_numeric_min_max() { let ctx =