-
Notifications
You must be signed in to change notification settings - Fork 176
Implement Builder for Scans on TableChanges
#521
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f35d1f4
3566220
63aba2f
c5c79e6
fe8cea5
ef9ffa0
58e7cfd
da8e8b0
030c64a
3a38232
113b267
2c53852
f7c00a8
53732c4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,234 @@ | ||
| use std::sync::Arc; | ||
|
|
||
| use itertools::Itertools; | ||
| use tracing::debug; | ||
|
|
||
| use crate::scan::ColumnType; | ||
| use crate::schema::SchemaRef; | ||
| use crate::{DeltaResult, ExpressionRef}; | ||
|
|
||
| use super::{TableChanges, CDF_FIELDS}; | ||
|
|
||
| /// The result of building a [`TableChanges`] scan over a table. This can be used to get a change | ||
| /// data feed from the table | ||
| #[allow(unused)] | ||
|
zachschuermann marked this conversation as resolved.
|
||
| #[derive(Debug)] | ||
| pub struct TableChangesScan { | ||
| table_changes: Arc<TableChanges>, | ||
| logical_schema: SchemaRef, | ||
| predicate: Option<ExpressionRef>, | ||
| all_fields: Vec<ColumnType>, | ||
| have_partition_cols: bool, | ||
| } | ||
|
|
||
| /// This builder constructs a [`TableChangesScan`] that can be used to read the [`TableChanges`] | ||
| /// of a table. [`TableChangesScanBuilder`] allows you to specify a schema to project the columns | ||
| /// or specify a predicate to filter rows in the Change Data Feed. Note that predicates over Change | ||
| /// Data Feed columns `_change_type`, `_commit_version`, and `_commit_timestamp` are not currently | ||
| /// allowed. See issue [#525](https://github.com/delta-io/delta-kernel-rs/issues/525). | ||
| /// | ||
| /// Note: There is a lot of shared functionality between [`TableChangesScanBuilder`] and | ||
| /// [`ScanBuilder`]. | ||
| /// | ||
| /// [`ScanBuilder`]: crate::scan::ScanBuilder | ||
| /// #Examples | ||
| /// Construct a [`TableChangesScan`] from `table_changes` with a given schema and predicate | ||
| /// ```rust | ||
| /// # use std::sync::Arc; | ||
| /// # use delta_kernel::engine::sync::SyncEngine; | ||
| /// # use delta_kernel::expressions::{column_expr, Scalar}; | ||
| /// # use delta_kernel::{Expression, Table}; | ||
| /// # let path = "./tests/data/table-with-cdf"; | ||
| /// # let engine = Box::new(SyncEngine::new()); | ||
| /// # let table = Table::try_from_uri(path).unwrap(); | ||
| /// # let table_changes = table.table_changes(engine.as_ref(), 0, 1).unwrap(); | ||
| /// let schema = table_changes | ||
| /// .schema() | ||
| /// .project(&["id", "_commit_version"]) | ||
| /// .unwrap(); | ||
| /// let predicate = Arc::new(Expression::gt(column_expr!("id"), Scalar::from(10))); | ||
| /// let scan = table_changes | ||
| /// .into_scan_builder() | ||
| /// .with_schema(schema) | ||
| /// .with_predicate(predicate.clone()) | ||
| /// .build(); | ||
| /// ``` | ||
| #[derive(Debug)] | ||
| pub struct TableChangesScanBuilder { | ||
|
zachschuermann marked this conversation as resolved.
|
||
| table_changes: Arc<TableChanges>, | ||
| schema: Option<SchemaRef>, | ||
| predicate: Option<ExpressionRef>, | ||
| } | ||
|
|
||
| impl TableChangesScanBuilder { | ||
| /// Create a new [`TableChangesScanBuilder`] instance. | ||
| pub fn new(table_changes: impl Into<Arc<TableChanges>>) -> Self { | ||
| Self { | ||
| table_changes: table_changes.into(), | ||
| schema: None, | ||
| predicate: None, | ||
|
zachschuermann marked this conversation as resolved.
|
||
| } | ||
| } | ||
|
|
||
| /// Provide [`Schema`] for columns to select from the [`TableChanges`]. | ||
| /// | ||
| /// A table with columns `[a, b, c]` could have a scan which reads only the first | ||
| /// two columns by using the schema `[a, b]`. | ||
| /// | ||
| /// [`Schema`]: crate::schema::Schema | ||
| pub fn with_schema(mut self, schema: impl Into<Option<SchemaRef>>) -> Self { | ||
| self.schema = schema.into(); | ||
| self | ||
| } | ||
|
|
||
| /// Optionally provide an expression to filter rows. For example, using the predicate `x < | ||
| /// 4` to return a subset of the rows in the scan which satisfy the filter. If `predicate_opt` | ||
| /// is `None`, this is a no-op. | ||
| /// | ||
| /// NOTE: The filtering is best-effort and can produce false positives (rows that should should | ||
| /// have been filtered out but were kept). | ||
| pub fn with_predicate(mut self, predicate: impl Into<Option<ExpressionRef>>) -> Self { | ||
| self.predicate = predicate.into(); | ||
| self | ||
| } | ||
|
|
||
| /// Build the [`TableChangesScan`]. | ||
| /// | ||
| /// This does not scan the table at this point, but does do some work to ensure that the | ||
| /// provided schema make sense, and to prepare some metadata that the scan will need. The | ||
| /// [`TableChangesScan`] type itself can be used to fetch the files and associated metadata required to | ||
| /// perform actual data reads. | ||
| pub fn build(self) -> DeltaResult<TableChangesScan> { | ||
| // if no schema is provided, use `TableChanges`'s entire (logical) schema (e.g. SELECT *) | ||
| let logical_schema = self | ||
| .schema | ||
| .unwrap_or_else(|| self.table_changes.schema.clone().into()); | ||
| let mut have_partition_cols = false; | ||
| let mut read_fields = Vec::with_capacity(logical_schema.fields.len()); | ||
|
|
||
| // Loop over all selected fields. We produce the following: | ||
| // - If the field is read from the parquet file then it is ([`ColumnType::Selected`]). | ||
| // - If the field is a column generated by CDF, it is also ([`ColumnType::Selected`]). | ||
| // These fields will be handled separately from the other ([`ColumnType::Selected`]). | ||
| // - If the field is a partition column, it is ([`ColumnType::Partition`]). | ||
| // | ||
| // Both the partition columns and CDF generated columns will be filled in by evaluating an | ||
| // expression when transforming physical data to the logical representation. | ||
| let all_fields = logical_schema | ||
| .fields() | ||
| .enumerate() | ||
| .map(|(index, logical_field)| -> DeltaResult<_> { | ||
| if self | ||
| .table_changes | ||
| .partition_columns() | ||
| .contains(logical_field.name()) | ||
| { | ||
| // Store the index into the schema for this field. When we turn it into an | ||
| // expression in the inner loop, we will index into the schema and get the name and | ||
| // data type, which we need to properly materialize the column. | ||
| have_partition_cols = true; | ||
| Ok(ColumnType::Partition(index)) | ||
| } else if CDF_FIELDS | ||
| .iter() | ||
| .any(|field| field.name() == logical_field.name()) | ||
| { | ||
| // CDF Columns are generated, so they do not have a column mapping. These will | ||
| // be processed separately and used to build an expression when transforming physical | ||
| // data to logical. | ||
| Ok(ColumnType::Selected(logical_field.name().to_string())) | ||
| } else { | ||
| // Add to read schema, store field so we can build a `Column` expression later | ||
| // if needed (i.e. if we have partition columns) | ||
| let physical_field = | ||
| logical_field.make_physical(*self.table_changes.column_mapping_mode())?; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI this changes a lot to support column mapping and/or nested columns: #512
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm how do you think we should handle this?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an area where CDF is not different from a normal scan. We just need to leverage the same approach the other PR introduces. IMO we should NOT solve it here -- just let the other PR pick it up.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sweet, ty! |
||
| debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); | ||
| let physical_name = physical_field.name.clone(); | ||
| read_fields.push(physical_field); | ||
| Ok(ColumnType::Selected(physical_name)) | ||
| } | ||
| }) | ||
| .try_collect()?; | ||
| Ok(TableChangesScan { | ||
| table_changes: self.table_changes, | ||
| logical_schema, | ||
| predicate: self.predicate, | ||
| all_fields, | ||
| have_partition_cols, | ||
| }) | ||
| } | ||
| } | ||
| #[cfg(test)] | ||
| mod tests { | ||
|
|
||
| use std::sync::Arc; | ||
|
|
||
| use crate::engine::sync::SyncEngine; | ||
| use crate::expressions::{column_expr, Scalar}; | ||
| use crate::scan::ColumnType; | ||
| use crate::schema::{DataType, StructField, StructType}; | ||
| use crate::{Expression, Table}; | ||
|
|
||
| #[test] | ||
| fn simple_table_changes_scan_builder() { | ||
| let path = "./tests/data/table-with-cdf"; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: little bit confusing there's a column named 'part' but there are no partition columns. maybe just add a comment that this is a non-partitioned table?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added 👍 |
||
| let engine = Box::new(SyncEngine::new()); | ||
| let table = Table::try_from_uri(path).unwrap(); | ||
|
|
||
| // A field in the schema goes from being nullable to non-nullable | ||
| let table_changes = table.table_changes(engine.as_ref(), 0, 1).unwrap(); | ||
|
|
||
| let scan = table_changes.into_scan_builder().build().unwrap(); | ||
| // Note that this table is not partitioned. `part` is a regular field | ||
| assert_eq!( | ||
| scan.all_fields, | ||
| vec![ | ||
| ColumnType::Selected("part".to_string()), | ||
| ColumnType::Selected("id".to_string()), | ||
| ColumnType::Selected("_change_type".to_string()), | ||
| ColumnType::Selected("_commit_version".to_string()), | ||
| ColumnType::Selected("_commit_timestamp".to_string()), | ||
|
Comment on lines
+187
to
+189
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess these generated columns are not file-constant values, so we have to treat them like normal columns even tho they don't come from the parquet?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct. I considered introducing a The current plan is to treat them as selected, and later check when transforming physical to logical:
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @nicklan @zachschuermann If you feel strongly that we should do
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's keep it as-is for now. Once we add support for row tracking there will be more generated columns to worry about and we can revisit with more context available.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, and I think the expression stuff will change this as well. Rather than a "selected" field, we'll just generate an expression to say, add a column with this value.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure SGTM :) |
||
| ] | ||
| ); | ||
| assert_eq!(scan.predicate, None); | ||
| assert!(!scan.have_partition_cols); | ||
| } | ||
|
|
||
| #[test] | ||
| fn projected_and_filtered_table_changes_scan_builder() { | ||
| let path = "./tests/data/table-with-cdf"; | ||
| let engine = Box::new(SyncEngine::new()); | ||
| let table = Table::try_from_uri(path).unwrap(); | ||
|
|
||
| // A field in the schema goes from being nullable to non-nullable | ||
| let table_changes = table.table_changes(engine.as_ref(), 0, 1).unwrap(); | ||
|
|
||
| let schema = table_changes | ||
| .schema() | ||
| .project(&["id", "_commit_version"]) | ||
| .unwrap(); | ||
| let predicate = Arc::new(Expression::gt(column_expr!("id"), Scalar::from(10))); | ||
| let scan = table_changes | ||
| .into_scan_builder() | ||
| .with_schema(schema) | ||
| .with_predicate(predicate.clone()) | ||
| .build() | ||
| .unwrap(); | ||
| assert_eq!( | ||
| scan.all_fields, | ||
| vec![ | ||
| ColumnType::Selected("id".to_string()), | ||
| ColumnType::Selected("_commit_version".to_string()), | ||
| ] | ||
| ); | ||
| assert_eq!( | ||
| scan.logical_schema, | ||
| StructType::new([ | ||
| StructField::new("id", DataType::INTEGER, true), | ||
| StructField::new("_commit_version", DataType::LONG, false), | ||
| ]) | ||
| .into() | ||
| ); | ||
| assert!(!scan.have_partition_cols); | ||
| assert_eq!(scan.predicate, Some(predicate)); | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.