Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions crates/iceberg/src/arrow/caching_delete_file_loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ impl CachingDeleteFileLoader {
PosDelLoadAction::Load => Ok(DeleteFileContext::PosDels {
file_path: task.file_path.clone(),
stream: basic_delete_file_loader
.parquet_to_batch_stream(&task.file_path)
.parquet_to_batch_stream(&task.file_path, task.file_size_in_bytes)
.await?,
}),
}
Expand All @@ -254,7 +254,7 @@ impl CachingDeleteFileLoader {
let equality_ids_vec = task.equality_ids.clone().unwrap();
let evolved_stream = BasicDeleteFileLoader::evolve_schema(
basic_delete_file_loader
.parquet_to_batch_stream(&task.file_path)
.parquet_to_batch_stream(&task.file_path, task.file_size_in_bytes)
.await?,
schema,
&equality_ids_vec,
Expand Down Expand Up @@ -614,7 +614,10 @@ mod tests {

let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone());
let record_batch_stream = basic_delete_file_loader
.parquet_to_batch_stream(&eq_delete_file_path)
.parquet_to_batch_stream(
&eq_delete_file_path,
std::fs::metadata(&eq_delete_file_path).unwrap().len(),
)
.await
.expect("could not get batch stream");

Expand Down Expand Up @@ -811,7 +814,10 @@ mod tests {
let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone());

let batch_stream = basic_delete_file_loader
.parquet_to_batch_stream(&delete_file_path)
.parquet_to_batch_stream(
&delete_file_path,
std::fs::metadata(&delete_file_path).unwrap().len(),
)
.await
.unwrap();

Expand Down Expand Up @@ -913,20 +919,23 @@ mod tests {

// Create FileScanTask with BOTH positional and equality deletes
let pos_del = FileScanTaskDeleteFile {
file_path: pos_del_path,
file_path: pos_del_path.clone(),
file_size_in_bytes: std::fs::metadata(&pos_del_path).unwrap().len(),
file_type: DataContentType::PositionDeletes,
partition_spec_id: 0,
equality_ids: None,
};

let eq_del = FileScanTaskDeleteFile {
file_path: eq_delete_path.clone(),
file_size_in_bytes: std::fs::metadata(&eq_delete_path).unwrap().len(),
file_type: DataContentType::EqualityDeletes,
partition_spec_id: 0,
equality_ids: Some(vec![2, 3]), // Only use field IDs that exist in both schemas
};

let file_scan_task = FileScanTask {
file_size_in_bytes: 0,
start: 0,
length: 0,
record_count: None,
Expand Down Expand Up @@ -993,7 +1002,7 @@ mod tests {

let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone());
let record_batch_stream = basic_delete_file_loader
.parquet_to_batch_stream(&path)
.parquet_to_batch_stream(&path, std::fs::metadata(&path).unwrap().len())
.await
.expect("could not get batch stream");

Expand Down
6 changes: 5 additions & 1 deletion crates/iceberg/src/arrow/delete_file_loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ impl BasicDeleteFileLoader {
pub(crate) async fn parquet_to_batch_stream(
&self,
data_file_path: &str,
file_size_in_bytes: u64,
) -> Result<ArrowRecordBatchStream> {
/*
Essentially a super-cut-down ArrowReader. We can't use ArrowReader directly
Expand All @@ -65,6 +66,7 @@ impl BasicDeleteFileLoader {
false,
None,
None,
file_size_in_bytes,
)
.await?
.build()?
Expand Down Expand Up @@ -102,7 +104,9 @@ impl DeleteFileLoader for BasicDeleteFileLoader {
task: &FileScanTaskDeleteFile,
schema: SchemaRef,
) -> Result<ArrowRecordBatchStream> {
let raw_batch_stream = self.parquet_to_batch_stream(&task.file_path).await?;
let raw_batch_stream = self
.parquet_to_batch_stream(&task.file_path, task.file_size_in_bytes)
.await?;

// For equality deletes, only evolve the equality_ids columns.
// For positional deletes (equality_ids is None), use all field IDs.
Expand Down
22 changes: 22 additions & 0 deletions crates/iceberg/src/arrow/delete_filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -378,27 +378,46 @@ pub(crate) mod tests {

let pos_del_1 = FileScanTaskDeleteFile {
file_path: format!("{}/pos-del-1.parquet", table_location.to_str().unwrap()),
file_size_in_bytes: std::fs::metadata(format!(
"{}/pos-del-1.parquet",
table_location.to_str().unwrap()
))
.unwrap()
.len(),
file_type: DataContentType::PositionDeletes,
partition_spec_id: 0,
equality_ids: None,
};

let pos_del_2 = FileScanTaskDeleteFile {
file_path: format!("{}/pos-del-2.parquet", table_location.to_str().unwrap()),
file_size_in_bytes: std::fs::metadata(format!(
"{}/pos-del-2.parquet",
table_location.to_str().unwrap()
))
.unwrap()
.len(),
file_type: DataContentType::PositionDeletes,
partition_spec_id: 0,
equality_ids: None,
};

let pos_del_3 = FileScanTaskDeleteFile {
file_path: format!("{}/pos-del-3.parquet", table_location.to_str().unwrap()),
file_size_in_bytes: std::fs::metadata(format!(
"{}/pos-del-3.parquet",
table_location.to_str().unwrap()
))
.unwrap()
.len(),
file_type: DataContentType::PositionDeletes,
partition_spec_id: 0,
equality_ids: None,
};

let file_scan_tasks = vec![
FileScanTask {
file_size_in_bytes: 0,
start: 0,
length: 0,
record_count: None,
Expand All @@ -414,6 +433,7 @@ pub(crate) mod tests {
case_sensitive: false,
},
FileScanTask {
file_size_in_bytes: 0,
start: 0,
length: 0,
record_count: None,
Expand Down Expand Up @@ -464,6 +484,7 @@ pub(crate) mod tests {

// ---------- fake FileScanTask ----------
let task = FileScanTask {
file_size_in_bytes: 0,
start: 0,
length: 0,
record_count: None,
Expand All @@ -474,6 +495,7 @@ pub(crate) mod tests {
predicate: None,
deletes: vec![FileScanTaskDeleteFile {
file_path: "eq-del.parquet".to_string(),
file_size_in_bytes: 1, // never read; this test fails before opening the file
file_type: DataContentType::EqualityDeletes,
partition_spec_id: 0,
equality_ids: None,
Expand Down
Loading
Loading