Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
34963ed
Implement morsel-driven execution for ParquetExec
google-labs-jules[bot] Feb 22, 2026
a67f9ac
Proto
Dandandan Feb 22, 2026
d0da5da
Proto
Dandandan Feb 22, 2026
32eec3c
Fmt
Dandandan Feb 22, 2026
5dc895c
Merge remote-tracking branch 'upstream/main' into parquet-morsel-driv…
Dandandan Feb 22, 2026
cc73788
Proto
Dandandan Feb 22, 2026
d517b5d
Fix
Dandandan Feb 22, 2026
de1606d
Fix
Dandandan Feb 22, 2026
950f6db
Clippy
Dandandan Feb 22, 2026
7f57317
Refactor
Dandandan Feb 23, 2026
fd6d7fd
WIP
Dandandan Feb 23, 2026
37126bf
WIP
Dandandan Feb 23, 2026
2d3c33e
WIP
Dandandan Feb 23, 2026
98f0ea9
WIP
Dandandan Feb 23, 2026
a389b02
WIP
Dandandan Feb 23, 2026
4065448
Update
Dandandan Feb 23, 2026
415315d
Update
Dandandan Feb 23, 2026
13b4977
Config
Dandandan Feb 23, 2026
a30c3f8
Test
Dandandan Feb 23, 2026
8b32ca8
Refactor
Dandandan Feb 23, 2026
876c296
Update test
Dandandan Feb 23, 2026
d2df36b
Update test
Dandandan Feb 23, 2026
869b7d3
Autofix
Dandandan Feb 23, 2026
67ea9ab
Prune files
Dandandan Feb 23, 2026
e845675
Update test
Dandandan Feb 23, 2026
6885981
Update test
Dandandan Feb 23, 2026
3384b8f
Update morsel_driven
Dandandan Feb 23, 2026
211d4fc
Update morsel_driven
Dandandan Feb 23, 2026
2db61f1
fmt
Dandandan Feb 23, 2026
c859d6a
move pruning
Dandandan Feb 23, 2026
24b95fb
Revert "move pruning"
Dandandan Feb 24, 2026
80fa1ec
Reapply "move pruning"
Dandandan Feb 24, 2026
1dcd401
Autofix
Dandandan Feb 24, 2026
04b08a6
Autofix
Dandandan Feb 24, 2026
9799b96
Autofix
Dandandan Feb 24, 2026
de29e40
Autofix
Dandandan Feb 24, 2026
aa27a43
Clippy
Dandandan Feb 24, 2026
9a4aa84
Undo submodule
Dandandan Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,13 @@ async fn csv_opener() -> Result<()> {
.create_file_opener(object_store, &scan_config, 0)?;

let mut result = vec![];
let mut stream =
FileStream::new(&scan_config, 0, opener, &ExecutionPlanMetricsSet::new())?;
let mut stream = FileStream::new(
&scan_config,
0,
opener,
&ExecutionPlanMetricsSet::new(),
None,
)?;
while let Some(batch) = stream.next().await.transpose()? {
result.push(batch);
}
Expand Down Expand Up @@ -142,6 +147,7 @@ async fn json_opener() -> Result<()> {
0,
Arc::new(opener),
&ExecutionPlanMetricsSet::new(),
None,
)?;
let mut result = vec![];
while let Some(batch) = stream.next().await.transpose()? {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ async fn search_accounts(
expected_result_length: usize,
) -> Result<()> {
// create local execution context
let ctx = SessionContext::new();

let config = SessionConfig::new()
.set_bool("datafusion.execution.parquet.allow_morsel_driven", false);
let ctx = SessionContext::new_with_config(config);
// create logical plan composed of a single TableScan
let logical_plan = LogicalPlanBuilder::scan_with_filters(
"accounts",
Expand Down
1 change: 1 addition & 0 deletions datafusion-examples/examples/data_io/json_shredding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ pub async fn json_shredding() -> Result<()> {
// Set up query execution
let mut cfg = SessionConfig::new();
cfg.options_mut().execution.parquet.pushdown_filters = true;
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The json_shredding example disables morsel_driven execution at line 96, likely because the example relies on specific row group pruning metrics for assertions. However, there's no comment explaining why this is necessary. Consider adding a comment explaining that morsel_driven execution changes the metrics behavior and thus is disabled for this example's assertions to pass. This will help future maintainers understand why this configuration is needed.

Suggested change
cfg.options_mut().execution.parquet.pushdown_filters = true;
cfg.options_mut().execution.parquet.pushdown_filters = true;
// Disable morsel-driven execution because it changes how parquet pruning
// metrics are reported, and this example asserts on specific row group
// pruning statistics from EXPLAIN ANALYZE.

Copilot uses AI. Check for mistakes.
cfg.options_mut().execution.parquet.allow_morsel_driven = false;
let ctx = SessionContext::new_with_config(cfg);
ctx.runtime_env().register_object_store(
ObjectStoreUrl::parse("memory://")?.as_ref(),
Expand Down
4 changes: 4 additions & 0 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,10 @@ config_namespace! {
/// (reading) Use any available bloom filters when reading parquet files
pub bloom_filter_on_read: bool, default = true

/// (reading) If true, the parquet reader will share work between partitions
/// using morsel-driven execution. This can help mitigate data skew.
pub allow_morsel_driven: bool, default = true

/// (reading) The maximum predicate cache size, in bytes. When
/// `pushdown_filters` is enabled, sets the maximum memory used to cache
/// the results of predicate evaluation between filter evaluation and
Expand Down
3 changes: 3 additions & 0 deletions datafusion/common/src/file_options/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ impl ParquetOptions {
binary_as_string: _, // not used for writer props
coerce_int96: _, // not used for writer props
skip_arrow_metadata: _,
allow_morsel_driven: _,
max_predicate_cache_size: _,
} = self;

Expand Down Expand Up @@ -460,6 +461,7 @@ mod tests {
skip_arrow_metadata: defaults.skip_arrow_metadata,
coerce_int96: None,
max_predicate_cache_size: defaults.max_predicate_cache_size,
allow_morsel_driven: defaults.allow_morsel_driven,
}
}

Expand Down Expand Up @@ -573,6 +575,7 @@ mod tests {
schema_force_view_types: global_options_defaults.schema_force_view_types,
binary_as_string: global_options_defaults.binary_as_string,
skip_arrow_metadata: global_options_defaults.skip_arrow_metadata,
allow_morsel_driven: global_options_defaults.allow_morsel_driven,
coerce_int96: None,
},
column_specific_options,
Expand Down
153 changes: 152 additions & 1 deletion datafusion/core/src/datasource/physical_plan/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,11 @@ mod tests {
use datafusion_common::config::TableParquetOptions;
use datafusion_common::test_util::{batches_to_sort_string, batches_to_string};
use datafusion_common::{Result, ScalarValue, assert_contains};
use datafusion_common_runtime::SpawnedTask;
use datafusion_datasource::file_format::FileFormat;
use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
use datafusion_datasource::file_scan_config::{
FileScanConfig, FileScanConfigBuilder,
};
use datafusion_datasource::source::DataSourceExec;

use datafusion_datasource::file::FileSource;
Expand Down Expand Up @@ -2459,4 +2462,152 @@ mod tests {
assert_eq!(calls.len(), 2);
assert_eq!(calls, vec![Some(123), Some(456)]);
}

/// Verifies that morsel-driven execution shares scan work across partitions.
///
/// A single parquet file with 100 row groups is assigned entirely to
/// partition 0 (deliberately skewed) while partition 1 gets an empty file
/// group. With `with_morsel_driven(true)` both partitions are expected to
/// produce rows, because the file's row groups are handed out dynamically
/// via a shared queue rather than statically per partition.
/// Also checks that the same plan can be executed a second time.
#[tokio::test]
async fn parquet_morsel_driven_execution() -> Result<()> {
    // In-memory object store so the test needs no filesystem.
    let store =
        Arc::new(object_store::memory::InMemory::new()) as Arc<dyn ObjectStore>;
    let store_url = ObjectStoreUrl::parse("memory://test").unwrap();

    let ctx = SessionContext::new();
    ctx.register_object_store(store_url.as_ref(), store.clone());

    // Create a Parquet file with 100 row groups, each with 10 rows
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));

    let mut out = Vec::new();
    // Cap row group size at 10 rows so each written batch becomes its own
    // row group (100 row groups total = 100 units of shareable work).
    let props = WriterProperties::builder()
        .set_max_row_group_size(10)
        .build();
    {
        let mut writer =
            ArrowWriter::try_new(&mut out, Arc::clone(&schema), Some(props))?;
        // Write many batches to ensure they are not coalesced and we can verify work distribution
        for i in 0..100 {
            let batch = RecordBatch::try_new(
                Arc::clone(&schema),
                vec![Arc::new(Int32Array::from(vec![i; 10]))],
            )?;
            writer.write(&batch)?;
        }
        writer.close()?;
    }

    let path = Path::from("skewed.parquet");
    store.put(&path, out.into()).await?;
    let meta = store.head(&path).await?;

    // Set up DataSourceExec with 2 partitions, but the file is only in partition 0 (skewed)
    let source = Arc::new(ParquetSource::new(schema));
    let config = FileScanConfigBuilder::new(store_url, source)
        .with_file_group(FileGroup::new(vec![PartitionedFile::new_from_meta(meta)]))
        .with_file_group(FileGroup::new(vec![])) // Partition 1 is empty
        .with_morsel_driven(true)
        .build();

    let exec = DataSourceExec::from_data_source(config);

    // Execute both partitions concurrently
    let task_ctx = ctx.task_ctx();
    let stream0 = exec.execute(0, Arc::clone(&task_ctx))?;
    let stream1 = exec.execute(1, Arc::clone(&task_ctx))?;

    let handle0 = SpawnedTask::spawn(async move {
        let mut count = 0;
        let mut s = stream0;
        while let Some(batch) = s.next().await {
            count += batch.unwrap().num_rows();
            // Yield after each batch so the two partition tasks interleave,
            // giving partition 1 a chance to steal work from the queue.
            tokio::task::yield_now().await;
        }
        count
    });

    let handle1 = SpawnedTask::spawn(async move {
        let mut count = 0;
        let mut s = stream1;
        while let Some(batch) = s.next().await {
            count += batch.unwrap().num_rows();
            tokio::task::yield_now().await;
        }
        count
    });

    let count0 = handle0.await.unwrap();
    let count1 = handle1.await.unwrap();

    // Total rows should be 1000
    assert_eq!(count0 + count1, 1000);

    // Since it's morsel-driven, both partitions should have done some work
    // because the work from partition 0 (the single file) was split into
    // individual row groups and shared via the shared queue.
    // NOTE(review): this assumes the scheduler hands at least one morsel to
    // each partition under the yield_now interleaving above — confirm this
    // cannot be flaky on a single-threaded runtime.
    assert!(count0 > 0, "Partition 0 should have produced rows");
    assert!(count1 > 0, "Partition 1 should have produced rows");

    // Test re-executability: executing the same plan again should work
    let stream0 = exec.execute(0, Arc::clone(&task_ctx))?;
    let stream1 = exec.execute(1, Arc::clone(&task_ctx))?;

    let mut count = 0;
    let mut s0 = stream0;
    while let Some(batch) = s0.next().await {
        count += batch.unwrap().num_rows();
    }
    let mut s1 = stream1;
    while let Some(batch) = s1.next().await {
        count += batch.unwrap().num_rows();
    }
    assert_eq!(
        count, 1000,
        "Second execution should also produce 1000 rows"
    );

    Ok(())
}

/// Checks that a default-configured session plans parquet scans with
/// `morsel_driven` enabled, without any explicit configuration.
#[tokio::test]
async fn parquet_morsel_driven_enabled_by_default() -> Result<()> {
    // Write a tiny single-batch parquet file into a temp directory.
    let tmp_dir = TempDir::new()?;
    let file_path = tmp_dir.path().join("test.parquet");
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;

    let mut writer =
        ArrowWriter::try_new(File::create(&file_path)?, Arc::clone(&schema), None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Register the file with a default-configured session and plan a scan.
    let ctx = SessionContext::new();
    ctx.register_parquet(
        "t",
        file_path.to_str().unwrap(),
        ParquetReadOptions::default(),
    )
    .await?;

    let plan = ctx
        .sql("SELECT * FROM t")
        .await?
        .create_physical_plan()
        .await?;

    // The scan may be the plan root, or sit directly under a projection.
    let ds_exec = match plan.as_any().downcast_ref::<DataSourceExec>() {
        Some(ds) => ds,
        None => plan.children()[0]
            .as_any()
            .downcast_ref::<DataSourceExec>()
            .expect("Expected DataSourceExec"),
    };

    let config = ds_exec
        .data_source()
        .as_any()
        .downcast_ref::<FileScanConfig>()
        .expect("Expected FileScanConfig");

    assert!(
        config.morsel_driven,
        "morsel_driven should be enabled by default for Parquet"
    );

    Ok(())
}
}
52 changes: 51 additions & 1 deletion datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,58 @@ impl RunQueryResult {
format!("{}", pretty_format_batches(&self.result).unwrap())
}

/// Extract ORDER BY column names from the query.
/// The query format is always:
/// `SELECT * FROM test_table ORDER BY <col> <dir> <nulls>, ... LIMIT <n>`
fn sort_columns(&self) -> Vec<String> {
    // Slice out the text between "ORDER BY" and the trailing " LIMIT",
    // then take the first whitespace-delimited token of each comma-separated
    // sort specification (the column name, dropping direction/nulls).
    let (_, tail) = self.query.split_once("ORDER BY").unwrap();
    let limit_pos = tail.rfind(" LIMIT").unwrap();
    tail[..limit_pos]
        .split(',')
        .map(|spec| spec.split_whitespace().next().unwrap().to_string())
        .collect()
}

/// Project `batches` to only include the named columns.
fn project_columns(batches: &[RecordBatch], cols: &[String]) -> Vec<RecordBatch> {
    batches
        .iter()
        .map(|batch| {
            let schema = batch.schema();
            // Resolve names to column indices, silently skipping any name
            // that does not exist in this batch's schema.
            let keep: Vec<usize> = cols
                .iter()
                .filter_map(|name| schema.index_of(name).ok())
                .collect();
            let projected_fields: Vec<_> =
                keep.iter().map(|&idx| schema.field(idx).clone()).collect();
            let projected_columns: Vec<_> =
                keep.iter().map(|&idx| Arc::clone(batch.column(idx))).collect();
            RecordBatch::try_new(Arc::new(Schema::new(projected_fields)), projected_columns)
                .unwrap()
        })
        .collect()
}

/// Returns true when the actual result matches the expected result.
///
/// First compares the fully formatted batches. If they differ, fall back to
/// comparing only the ORDER BY column values.
///
/// For queries with ORDER BY <col> LIMIT k, multiple rows may tie on the
/// sort key (e.g. two rows with id=27 for ORDER BY id DESC LIMIT 1).
/// SQL permits returning any of the tied rows, so with vs without dynamic
/// filter pushdown may legitimately return different tied rows.
///
/// The dynamic filter must not change the *sort-key values* of the top-k
/// result. We verify correctness by projecting both results down to only
/// the ORDER BY columns and comparing those.
fn is_ok(&self) -> bool {
    // NOTE: the leftover pre-refactor expression line that previously
    // preceded this `if` (a stray duplicate of the equality check) has been
    // removed — it was dead merge residue and not valid as a statement here.
    if self.expected_formatted() == self.result_formatted() {
        return true;
    }
    let sort_cols = self.sort_columns();
    let expected_keys = Self::project_columns(&self.expected, &sort_cols);
    let result_keys = Self::project_columns(&self.result, &sort_cols);
    format!("{}", pretty_format_batches(&expected_keys).unwrap())
        == format!("{}", pretty_format_batches(&result_keys).unwrap())
}
Comment on lines 263 to 282
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new comparison logic for TOP-K queries with ties is well-designed and handles the case where morsel-driven execution can return different tied rows. The implementation correctly projects down to only the ORDER BY columns and compares those values. However, consider adding a comment in the code explaining that this is specifically needed to handle non-deterministic row selection in the presence of ties, which can vary with morsel-driven execution. This will help future maintainers understand the special handling.

Copilot uses AI. Check for mistakes.
}

Expand Down
6 changes: 5 additions & 1 deletion datafusion/core/tests/parquet/row_group_pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,11 @@ async fn prune_disabled() {
.await;
println!("{}", output.description());

// This should not prune any
// Row group stats pruning is disabled, so 0 row groups are pruned by statistics.
// Bloom filter runs next and matches all 4 row groups (bloom filters don't help
// for range/inequality predicates like `nanos < threshold`). Page index pruning
// runs afterwards and can produce row-level selections, but those don't affect
// the bloom filter matched count. The query result is still correct.
assert_eq!(output.predicate_evaluation_errors(), Some(0));
assert_eq!(output.row_groups_matched(), Some(4));
assert_eq!(output.row_groups_pruned(), Some(0));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,9 @@ mod test {
create_table_sql: Option<&str>,
target_partition: Option<usize>,
) -> Arc<dyn ExecutionPlan> {
let mut session_config = SessionConfig::new().with_collect_statistics(true);
let mut session_config = SessionConfig::new()
.with_collect_statistics(true)
.set_bool("datafusion.execution.parquet.allow_morsel_driven", false);
if let Some(partition) = target_partition {
session_config = session_config.with_target_partitions(partition);
}
Expand Down
1 change: 1 addition & 0 deletions datafusion/datasource-parquet/src/file_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@ impl FileFormat for ParquetFormat {

let conf = FileScanConfigBuilder::from(conf)
.with_source(Arc::new(source))
.with_morsel_driven(self.options.global.allow_morsel_driven)
.build();
Ok(DataSourceExec::from_data_source(conf))
}
Expand Down
Loading
Loading