Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ struct ValidateArgs {
// Arguments for the `Commands::Schema` subcommand. `main` copies `verbose` out
// before passing the whole struct to `run_schema`, then uses it to choose between
// the plain (`{e}`) and alternate (`{e:#}`) error format on failure.
// The `///` comments on the fields are left untouched on purpose.
struct SchemaArgs {
/// Path to the dataset file (CSV or Parquet)
file: String,
/// Print full error chain on failure
#[arg(long)]
verbose: bool,
}

#[derive(Args)]
Expand Down Expand Up @@ -227,8 +230,13 @@ async fn main() {
}
}
Commands::Schema(args) => {
let verbose = args.verbose;
if let Err(e) = run_schema(args).await {
eprintln!("Error: {e}");
if verbose {
eprintln!("Error: {e:#}");
} else {
eprintln!("Error: {e}");
}
std::process::exit(1);
}
}
Expand Down
34 changes: 31 additions & 3 deletions src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,9 +223,18 @@ pub async fn introspect(ctx: &SessionContext, table_name: &str) -> anyhow::Resul
)
.await?;

// approx_distinct uses HyperLogLog for bounded-memory cardinality estimation.
// We cast to VARCHAR first because DataFusion's HLL does not support all physical
// types that appear in parquet files (e.g. Date32, Utf8View). The cast is free for
// string-like types and produces the canonical string form for dates and timestamps,
// so distinct counts remain correct. Error is ~1% at HLL's default precision; exact
// for small cardinalities. approx_distinct returns an unsigned 64-bit count (UInt64), so the
// outer CAST to BIGINT is what keeps run_count_sql's Int64 downcast valid.
let unique = run_count_sql(
ctx,
&format!("SELECT COUNT(DISTINCT \"{}\") FROM {}", col, table_name),
&format!(
"SELECT CAST(approx_distinct(CAST(\"{}\" AS VARCHAR)) AS BIGINT) FROM {}",
col, table_name
),
)
.await?;

Expand Down Expand Up @@ -305,15 +314,15 @@ mod tests {

#[tokio::test]
async fn test_introspect_unique() {
// 3 distinct values
// HLL is exact at small cardinalities — 3 distinct values.
let ctx =
make_ctx("CREATE TABLE data AS SELECT * FROM (VALUES ('a'), ('b'), ('c')) AS t(v)")
.await;
let out = introspect(&ctx, "data").await.unwrap();
let col = &out.columns[0];
assert_eq!(col.unique, 3);

// 2 distinct values (one duplicate)
// 2 distinct values (one duplicate).
let ctx2 =
make_ctx("CREATE TABLE data AS SELECT * FROM (VALUES ('a'), ('b'), ('a')) AS t(v)")
.await;
Expand All @@ -322,6 +331,25 @@ mod tests {
assert_eq!(col2.unique, 2);
}

#[tokio::test]
async fn test_introspect_unique_date_column() {
    // Regression guard: introspecting a table whose only column is a DATE must
    // succeed and still report the right distinct count.
    // Fixture: three rows, two of which share the same date.
    let sql = "CREATE TABLE data AS SELECT * FROM (\
               VALUES \
               (CAST('2022-01-01' AS DATE)), \
               (CAST('2022-06-15' AS DATE)), \
               (CAST('2022-01-01' AS DATE))\
               ) AS t(d)";
    let session = make_ctx(sql).await;

    let report = introspect(&session, "data").await.unwrap();

    // The duplicate collapses, leaving exactly two distinct dates.
    assert_eq!(report.columns[0].unique, 2);
}

#[tokio::test]
async fn test_introspect_numeric_min_max() {
let ctx =
Expand Down
Loading