From 0c75ddde9255a149c20847a59a2b39b7a80b5051 Mon Sep 17 00:00:00 2001
From: Piotr Findeisen
Date: Mon, 26 Aug 2024 16:18:57 +0200
Subject: [PATCH 01/23] Use Result.unwrap_or_else where applicable (#12166)

---
 datafusion/proto/tests/cases/roundtrip_logical_plan.rs | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index 09c5f0f8bd3d..4f58185798f7 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -104,10 +104,8 @@ fn roundtrip_json_test(_proto: &protobuf::LogicalExprNode) {}
 fn roundtrip_expr_test(initial_struct: Expr, ctx: SessionContext) {
     let extension_codec = DefaultLogicalExtensionCodec {};
     let proto: protobuf::LogicalExprNode =
-        match serialize_expr(&initial_struct, &extension_codec) {
-            Ok(p) => p,
-            Err(e) => panic!("Error serializing expression: {:?}", e),
-        };
+        serialize_expr(&initial_struct, &extension_codec)
+            .unwrap_or_else(|e| panic!("Error serializing expression: {:?}", e));

     let round_trip: Expr =
         from_proto::parse_expr(&proto, &ctx, &extension_codec).unwrap();

From 533ddcb6f43dc1b11368bdc33924afd87f439e19 Mon Sep 17 00:00:00 2001
From: wiedld
Date: Mon, 26 Aug 2024 09:01:02 -0700
Subject: [PATCH 02/23] Provide documentation of exposed APIs to enable
 handling of type coercion at UNION plan construction. (#12142)

* chore(12105): enable union type-coercion by two approaches, using newly pub interfaces

* chore(12105): update documentation to delineate between the interfaces involved in type coercion

* chore(12105): update union() logical plan construction docs, to address type coercion
---
 datafusion/expr/src/logical_plan/builder.rs   | 12 +++++++++-
 .../optimizer/src/analyzer/type_coercion.rs   | 22 ++++++++++++++-----
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs
index 559908bcfdfa..fdd07da023e0 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -1331,7 +1331,17 @@ pub fn validate_unique_names<'a>(
         })
 }

-/// Union two logical plans.
+/// Union two [`LogicalPlan`]s.
+///
+/// Constructs the UNION plan, but does not perform type-coercion. Therefore the
+/// subtree expressions will not be properly typed until the optimizer pass.
+///
+/// If a properly typed UNION plan is needed, refer to [`TypeCoercionRewriter::coerce_union`]
+/// or, alternatively, merge the union input schema using [`coerce_union_schema`] and
+/// apply the expression rewrite with [`coerce_plan_expr_for_schema`].
+///
+/// [`TypeCoercionRewriter::coerce_union`]: https://docs.rs/datafusion-optimizer/latest/datafusion_optimizer/analyzer/type_coercion/struct.TypeCoercionRewriter.html#method.coerce_union
+/// [`coerce_union_schema`]: https://docs.rs/datafusion-optimizer/latest/datafusion_optimizer/analyzer/type_coercion/fn.coerce_union_schema.html
 pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result<LogicalPlan> {
     // Temporarily use the schema from the left input and later rely on the analyzer to
     // coerce the two schemas into a common one.
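The new documentation above describes two coercion routes. As a rough sketch (not part of the patch; it assumes the `datafusion-expr` and `datafusion-optimizer` crates at this revision, with placeholder `left`/`right` inputs), a caller could pair the plain `union` constructor with the newly public rewriter like so:

```rust
use datafusion_common::Result;
use datafusion_expr::logical_plan::{builder::union, LogicalPlan};
use datafusion_optimizer::analyzer::type_coercion::TypeCoercionRewriter;

/// Build a UNION plan and immediately coerce its inputs to a common schema,
/// instead of leaving that to the analyzer pass.
fn coerced_union(left: LogicalPlan, right: LogicalPlan) -> Result<LogicalPlan> {
    // `union` alone keeps the left input's schema until coercion runs.
    match union(left, right)? {
        LogicalPlan::Union(u) => TypeCoercionRewriter::coerce_union(u),
        // Hedge: `union` is expected to return a Union node here.
        other => Ok(other),
    }
}
```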
diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs
index 68ab2e13005f..315284c50839 100644
--- a/datafusion/optimizer/src/analyzer/type_coercion.rs
+++ b/datafusion/optimizer/src/analyzer/type_coercion.rs
@@ -56,6 +56,8 @@ use datafusion_expr::{
     Projection, ScalarUDF, Union, WindowFrame, WindowFrameBound, WindowFrameUnits,
 };

+/// Performs type coercion by determining the schema
+/// and performing the expression rewrites.
 #[derive(Default)]
 pub struct TypeCoercion {}

@@ -128,16 +130,23 @@ fn analyze_internal(
         .map_data(|plan| plan.recompute_schema())
 }

-pub(crate) struct TypeCoercionRewriter<'a> {
+/// Rewrite expressions to apply type coercion.
+pub struct TypeCoercionRewriter<'a> {
     pub(crate) schema: &'a DFSchema,
 }

 impl<'a> TypeCoercionRewriter<'a> {
+    /// Create a new [`TypeCoercionRewriter`] with a provided schema
+    /// representing both the inputs and output of the [`LogicalPlan`] node.
     fn new(schema: &'a DFSchema) -> Self {
         Self { schema }
     }

-    fn coerce_plan(&mut self, plan: LogicalPlan) -> Result<LogicalPlan> {
+    /// Coerce the [`LogicalPlan`].
+    ///
+    /// Refer to [`TypeCoercionRewriter::coerce_join`] and [`TypeCoercionRewriter::coerce_union`]
+    /// for the type-coercion approach.
+    pub fn coerce_plan(&mut self, plan: LogicalPlan) -> Result<LogicalPlan> {
         match plan {
             LogicalPlan::Join(join) => self.coerce_join(join),
             LogicalPlan::Union(union) => Self::coerce_union(union),
@@ -153,7 +162,7 @@ impl<'a> TypeCoercionRewriter<'a> {
     ///
     /// For example, on_exprs like `t1.a = t2.b AND t1.x = t2.y` will be stored
     /// as a list of `(t1.a, t2.b), (t1.x, t2.y)`
-    fn coerce_join(&mut self, mut join: Join) -> Result<LogicalPlan> {
+    pub fn coerce_join(&mut self, mut join: Join) -> Result<LogicalPlan> {
         join.on = join
             .on
             .into_iter()
@@ -176,7 +185,7 @@ impl<'a> TypeCoercionRewriter<'a> {

     /// Coerce the union’s inputs to a common schema compatible with all inputs.
     /// This occurs after wildcard expansion and the coercion of the input expressions.
-    fn coerce_union(union_plan: Union) -> Result<LogicalPlan> {
+    pub fn coerce_union(union_plan: Union) -> Result<LogicalPlan> {
         let union_schema = Arc::new(coerce_union_schema(&union_plan.inputs)?);
         let new_inputs = union_plan
             .inputs
@@ -809,7 +818,10 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result<Expr> {
 }

 /// Get a common schema that is compatible with all inputs of UNION.
-fn coerce_union_schema(inputs: &[Arc<LogicalPlan>]) -> Result<DFSchema> {
+///
+/// This method presumes that the wildcard expansion is unneeded, or has already
+/// been applied.
+pub fn coerce_union_schema(inputs: &[Arc<LogicalPlan>]) -> Result<DFSchema> {
     let base_schema = inputs[0].schema();
     let mut union_datatypes = base_schema
         .fields()

From da3f6afec51e4272622cb74ec62a11d13df17267 Mon Sep 17 00:00:00 2001
From: Albert Skalt <133099191+askalt@users.noreply.github.com>
Date: Mon, 26 Aug 2024 22:10:16 +0300
Subject: [PATCH 03/23] Add ability to return LogicalPlan by value from
 TableProvider (#12113)

This patch changes the `get_logical_plan(...)` method signature. Now it
returns a `Cow` to allow an implementation to return a plan by value.
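To illustrate what the new signature allows (a hypothetical sketch, not code from this patch): a provider can now build and hand back an owned plan on demand, whereas `Option<&LogicalPlan>` forced it to keep the plan stored in a field. `MyView` and `build_plan` below are made-up names.

```rust
use std::borrow::Cow;
use datafusion_expr::LogicalPlan;

struct MyView {
    cached_plan: Option<LogicalPlan>,
}

impl MyView {
    // Mirrors the shape of the new `get_logical_plan`: borrow when a plan is
    // already stored, return an owned plan when it has to be built per call.
    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
        match &self.cached_plan {
            Some(plan) => Some(Cow::Borrowed(plan)),
            None => self.build_plan().map(Cow::Owned),
        }
    }

    // Hypothetical helper that derives a fresh plan; returns None here so the
    // sketch stays self-contained.
    fn build_plan(&self) -> Option<LogicalPlan> {
        None
    }
}
```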
---
 datafusion/catalog/src/table.rs               |  5 ++-
 datafusion/core/src/dataframe/mod.rs          |  5 ++-
 .../core/src/datasource/cte_worktable.rs      |  4 +-
 .../src/datasource/default_table_source.rs    |  4 +-
 datafusion/core/src/datasource/view.rs        |  6 +--
 datafusion/expr/src/table_source.rs           |  4 +-
 .../src/analyzer/inline_table_scan.rs         | 43 +++++++++----------
 7 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs
index 792315642a00..69fa81faf8e2 100644
--- a/datafusion/catalog/src/table.rs
+++ b/datafusion/catalog/src/table.rs
@@ -16,6 +16,7 @@
 // under the License.

 use std::any::Any;
+use std::borrow::Cow;
 use std::sync::Arc;

 use crate::session::Session;
@@ -56,8 +57,8 @@ pub trait TableProvider: Sync + Send {
         None
     }

-    /// Get the [`LogicalPlan`] of this table, if available
-    fn get_logical_plan(&self) -> Option<&LogicalPlan> {
+    /// Get the [`LogicalPlan`] of this table, if available.
+    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
         None
     }

diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
index a38e7f45a6f1..c516c7985d54 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -21,6 +21,7 @@
 mod parquet;

 use std::any::Any;
+use std::borrow::Cow;
 use std::collections::HashMap;
 use std::sync::Arc;

@@ -1648,8 +1649,8 @@ impl TableProvider for DataFrameTableProvider {
         self
     }

-    fn get_logical_plan(&self) -> Option<&LogicalPlan> {
-        Some(&self.plan)
+    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
+        Some(Cow::Borrowed(&self.plan))
     }

     fn supports_filters_pushdown(
diff --git a/datafusion/core/src/datasource/cte_worktable.rs b/datafusion/core/src/datasource/cte_worktable.rs
index d7d224828dda..d2da15c64f52 100644
--- a/datafusion/core/src/datasource/cte_worktable.rs
+++ b/datafusion/core/src/datasource/cte_worktable.rs
@@ -17,8 +17,8 @@

 //! CteWorkTable implementation used for recursive queries

-use std::any::Any;
 use std::sync::Arc;
+use std::{any::Any, borrow::Cow};

 use arrow::datatypes::SchemaRef;
 use async_trait::async_trait;
@@ -63,7 +63,7 @@ impl TableProvider for CteWorkTable {
         self
     }

-    fn get_logical_plan(&self) -> Option<&LogicalPlan> {
+    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
         None
     }

diff --git a/datafusion/core/src/datasource/default_table_source.rs b/datafusion/core/src/datasource/default_table_source.rs
index 977e681d6641..b4a5a76fc9ff 100644
--- a/datafusion/core/src/datasource/default_table_source.rs
+++ b/datafusion/core/src/datasource/default_table_source.rs
@@ -17,8 +17,8 @@

 //! Default TableSource implementation used in DataFusion physical plans

-use std::any::Any;
 use std::sync::Arc;
+use std::{any::Any, borrow::Cow};

 use crate::datasource::TableProvider;

@@ -70,7 +70,7 @@ impl TableSource for DefaultTableSource {
         self.table_provider.supports_filters_pushdown(filter)
     }

-    fn get_logical_plan(&self) -> Option<&datafusion_expr::LogicalPlan> {
+    fn get_logical_plan(&self) -> Option<Cow<datafusion_expr::LogicalPlan>> {
         self.table_provider.get_logical_plan()
     }

diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view.rs
index a81942bf769e..947714c1e4f9 100644
--- a/datafusion/core/src/datasource/view.rs
+++ b/datafusion/core/src/datasource/view.rs
@@ -17,7 +17,7 @@

 //! View data source which uses a LogicalPlan as it's input.
-use std::{any::Any, sync::Arc};
+use std::{any::Any, borrow::Cow, sync::Arc};

 use crate::{
     error::Result,
@@ -90,8 +90,8 @@ impl TableProvider for ViewTable {
         self
     }

-    fn get_logical_plan(&self) -> Option<&LogicalPlan> {
-        Some(&self.logical_plan)
+    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
+        Some(Cow::Borrowed(&self.logical_plan))
     }

     fn schema(&self) -> SchemaRef {
diff --git a/datafusion/expr/src/table_source.rs b/datafusion/expr/src/table_source.rs
index 2de3cc923315..8b8d2dfcf2df 100644
--- a/datafusion/expr/src/table_source.rs
+++ b/datafusion/expr/src/table_source.rs
@@ -22,7 +22,7 @@ use crate::{Expr, LogicalPlan};

 use arrow::datatypes::SchemaRef;
 use datafusion_common::{Constraints, Result};
-use std::any::Any;
+use std::{any::Any, borrow::Cow};

 /// Indicates how a filter expression is handled by
 /// [`TableProvider::scan`].
@@ -122,7 +122,7 @@ pub trait TableSource: Sync + Send {
     }

     /// Get the Logical plan of this table provider, if available.
-    fn get_logical_plan(&self) -> Option<&LogicalPlan> {
+    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
         None
     }

diff --git a/datafusion/optimizer/src/analyzer/inline_table_scan.rs b/datafusion/optimizer/src/analyzer/inline_table_scan.rs
index b69b8410da49..d5b3648725b9 100644
--- a/datafusion/optimizer/src/analyzer/inline_table_scan.rs
+++ b/datafusion/optimizer/src/analyzer/inline_table_scan.rs
@@ -24,7 +24,7 @@ use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::{Column, Result};
 use datafusion_expr::expr::WildcardOptions;
-use datafusion_expr::{logical_plan::LogicalPlan, Expr, LogicalPlanBuilder, TableScan};
+use datafusion_expr::{logical_plan::LogicalPlan, Expr, LogicalPlanBuilder};

 /// Analyzed rule that inlines TableScan that provide a [`LogicalPlan`]
 /// (DataFrame / ViewTable)
@@ -56,24 +56,23 @@ fn analyze_internal(plan: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
     match plan {
         // Match only on scans without filter / projection / fetch
         // Views and DataFrames won't have those added
-        // during the early stage of planning
-        LogicalPlan::TableScan(TableScan {
-            table_name,
-            source,
-            projection,
-            filters,
-            ..
-        }) if filters.is_empty() && source.get_logical_plan().is_some() => {
-            let sub_plan = source.get_logical_plan().unwrap();
-            let projection_exprs = generate_projection_expr(&projection, sub_plan)?;
-            LogicalPlanBuilder::from(sub_plan.clone())
-                .project(projection_exprs)?
-                // Ensures that the reference to the inlined table remains the
-                // same, meaning we don't have to change any of the parent nodes
-                // that reference this table.
-                .alias(table_name)?
-                .build()
-                .map(Transformed::yes)
+        // during the early stage of planning.
+        LogicalPlan::TableScan(table_scan) if table_scan.filters.is_empty() => {
+            if let Some(sub_plan) = table_scan.source.get_logical_plan() {
+                let sub_plan = sub_plan.into_owned();
+                let projection_exprs =
+                    generate_projection_expr(&table_scan.projection, &sub_plan)?;
+                LogicalPlanBuilder::from(sub_plan)
+                    .project(projection_exprs)?
+                    // Ensures that the reference to the inlined table remains the
+                    // same, meaning we don't have to change any of the parent nodes
+                    // that reference this table.
+                    .alias(table_scan.table_name)?
+                    .build()
+                    .map(Transformed::yes)
+            } else {
+                Ok(Transformed::no(LogicalPlan::TableScan(table_scan)))
+            }
         }
         _ => Ok(Transformed::no(plan)),
     }
@@ -104,7 +103,7 @@ fn generate_projection_expr(

 #[cfg(test)]
 mod tests {
-    use std::{sync::Arc, vec};
+    use std::{borrow::Cow, sync::Arc, vec};

     use crate::analyzer::inline_table_scan::InlineTableScan;
     use crate::test::assert_analyzed_plan_eq;
@@ -167,8 +166,8 @@ mod tests {
         Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]))
     }

-    fn get_logical_plan(&self) -> Option<&LogicalPlan> {
-        Some(&self.plan)
+    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
+        Some(Cow::Borrowed(&self.plan))
     }
 }

From 1b875f456a1d916d2c6826531dc1b3f46c42706f Mon Sep 17 00:00:00 2001
From: Emil Ejbyfeldt
Date: Mon, 26 Aug 2024 21:11:41 +0200
Subject: [PATCH 04/23] Implement groups accumulator for stddev and variance
 (#12095)

* Add more stddev/var test cases

Add test cases for stddev_samp/pop and var_samp/pop that include a
group_by clause.

* Implement GroupsAccumulator for stddev and variance

* Add cast to support all numeric types

* Improve documentation and comments
---
 .../groups_accumulator/accumulate.rs          | 268 +++++++++--------
 datafusion/functions-aggregate/src/stddev.rs  |  83 +++++-
 .../functions-aggregate/src/variance.rs       | 275 ++++++++++++++++--
 .../sqllogictest/test_files/aggregate.slt     |  79 +++++
 4 files changed, 555 insertions(+), 150 deletions(-)

diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs
index 455fc5fec450..a0475fe8e446 100644
--- a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs
@@ -91,36 +91,9 @@ impl NullState {
     /// * `opt_filter`: if present, only rows for which is Some(true) are included
     /// * `value_fn`: function invoked for (group_index, value) where value is non null
     ///
-    /// # Example
+    /// See [`accumulate`], for more details on how value_fn is called
     ///
-    /// ```text
-    ///  ┌─────────┐   ┌─────────┐   ┌ ─ ─ ─ ─ ┐
-    ///  │ ┌─────┐ │   │ ┌─────┐ │     ┌─────┐
-    ///  │ │  2  │ │   │ │ 200 │ │   │ │  t  │ │
-    ///  │ ├─────┤ │   │ ├─────┤ │     ├─────┤
-    ///  │ │  2  │ │   │ │ 100 │ │   │ │  f  │ │
-    ///  │ ├─────┤ │   │ ├─────┤ │     ├─────┤
-    ///  │ │  0  │ │   │ │ 200 │ │   │ │  t  │ │
-    ///  │ ├─────┤ │   │ ├─────┤ │     ├─────┤
-    ///  │ │  1  │ │   │ │ 200 │ │   │ │NULL │ │
-    ///  │ ├─────┤ │   │ ├─────┤ │     ├─────┤
-    ///  │ │  0  │ │   │ │ 300 │ │   │ │  t  │ │
-    ///  │ └─────┘ │   │ └─────┘ │     └─────┘
-    ///  └─────────┘   └─────────┘   └ ─ ─ ─ ─ ┘
-    ///
-    ///  group_indices   values        opt_filter
-    /// ```
-    ///
-    /// In the example above, `value_fn` is invoked for each (group_index,
-    /// value) pair where `opt_filter[i]` is true and values is non null
-    ///
-    /// ```text
-    /// value_fn(2, 200)
-    /// value_fn(0, 200)
-    /// value_fn(0, 300)
-    /// ```
-    ///
-    /// It also sets
+    /// When value_fn is called it also sets
     ///
     /// 1.
`self.seen_values[group_index]` to true for all rows that had a non null vale pub fn accumulate( @@ -134,105 +107,14 @@ impl NullState { T: ArrowPrimitiveType + Send, F: FnMut(usize, T::Native) + Send, { - let data: &[T::Native] = values.values(); - assert_eq!(data.len(), group_indices.len()); - // ensure the seen_values is big enough (start everything at // "not seen" valid) let seen_values = initialize_builder(&mut self.seen_values, total_num_groups, false); - - match (values.null_count() > 0, opt_filter) { - // no nulls, no filter, - (false, None) => { - let iter = group_indices.iter().zip(data.iter()); - for (&group_index, &new_value) in iter { - seen_values.set_bit(group_index, true); - value_fn(group_index, new_value); - } - } - // nulls, no filter - (true, None) => { - let nulls = values.nulls().unwrap(); - // This is based on (ahem, COPY/PASTE) arrow::compute::aggregate::sum - // iterate over in chunks of 64 bits for more efficient null checking - let group_indices_chunks = group_indices.chunks_exact(64); - let data_chunks = data.chunks_exact(64); - let bit_chunks = nulls.inner().bit_chunks(); - - let group_indices_remainder = group_indices_chunks.remainder(); - let data_remainder = data_chunks.remainder(); - - group_indices_chunks - .zip(data_chunks) - .zip(bit_chunks.iter()) - .for_each(|((group_index_chunk, data_chunk), mask)| { - // index_mask has value 1 << i in the loop - let mut index_mask = 1; - group_index_chunk.iter().zip(data_chunk.iter()).for_each( - |(&group_index, &new_value)| { - // valid bit was set, real value - let is_valid = (mask & index_mask) != 0; - if is_valid { - seen_values.set_bit(group_index, true); - value_fn(group_index, new_value); - } - index_mask <<= 1; - }, - ) - }); - - // handle any remaining bits (after the initial 64) - let remainder_bits = bit_chunks.remainder_bits(); - group_indices_remainder - .iter() - .zip(data_remainder.iter()) - .enumerate() - .for_each(|(i, (&group_index, &new_value))| { - let is_valid = remainder_bits & (1 << i) != 0; - if is_valid { - seen_values.set_bit(group_index, true); - value_fn(group_index, new_value); - } - }); - } - // no nulls, but a filter - (false, Some(filter)) => { - assert_eq!(filter.len(), group_indices.len()); - // The performance with a filter could be improved by - // iterating over the filter in chunks, rather than a single - // iterator. TODO file a ticket - group_indices - .iter() - .zip(data.iter()) - .zip(filter.iter()) - .for_each(|((&group_index, &new_value), filter_value)| { - if let Some(true) = filter_value { - seen_values.set_bit(group_index, true); - value_fn(group_index, new_value); - } - }) - } - // both null values and filters - (true, Some(filter)) => { - assert_eq!(filter.len(), group_indices.len()); - // The performance with a filter could be improved by - // iterating over the filter in chunks, rather than using - // iterators. 
TODO file a ticket - filter - .iter() - .zip(group_indices.iter()) - .zip(values.iter()) - .for_each(|((filter_value, &group_index), new_value)| { - if let Some(true) = filter_value { - if let Some(new_value) = new_value { - seen_values.set_bit(group_index, true); - value_fn(group_index, new_value) - } - } - }) - } - } + accumulate(group_indices, values, opt_filter, |group_index, value| { + seen_values.set_bit(group_index, true); + value_fn(group_index, value); + }); } /// Invokes `value_fn(group_index, value)` for each non null, non @@ -351,6 +233,144 @@ impl NullState { } } +/// Invokes `value_fn(group_index, value)` for each non null, non +/// filtered value of `value`, +/// +/// # Arguments: +/// +/// * `group_indices`: To which groups do the rows in `values` belong, (aka group_index) +/// * `values`: the input arguments to the accumulator +/// * `opt_filter`: if present, only rows for which is Some(true) are included +/// * `value_fn`: function invoked for (group_index, value) where value is non null +/// +/// # Example +/// +/// ```text +/// ┌─────────┐ ┌─────────┐ ┌ ─ ─ ─ ─ ┐ +/// │ ┌─────┐ │ │ ┌─────┐ │ ┌─────┐ +/// │ │ 2 │ │ │ │ 200 │ │ │ │ t │ │ +/// │ ├─────┤ │ │ ├─────┤ │ ├─────┤ +/// │ │ 2 │ │ │ │ 100 │ │ │ │ f │ │ +/// │ ├─────┤ │ │ ├─────┤ │ ├─────┤ +/// │ │ 0 │ │ │ │ 200 │ │ │ │ t │ │ +/// │ ├─────┤ │ │ ├─────┤ │ ├─────┤ +/// │ │ 1 │ │ │ │ 200 │ │ │ │NULL │ │ +/// │ ├─────┤ │ │ ├─────┤ │ ├─────┤ +/// │ │ 0 │ │ │ │ 300 │ │ │ │ t │ │ +/// │ └─────┘ │ │ └─────┘ │ └─────┘ +/// └─────────┘ └─────────┘ └ ─ ─ ─ ─ ┘ +/// +/// group_indices values opt_filter +/// ``` +/// +/// In the example above, `value_fn` is invoked for each (group_index, +/// value) pair where `opt_filter[i]` is true and values is non null +/// +/// ```text +/// value_fn(2, 200) +/// value_fn(0, 200) +/// value_fn(0, 300) +/// ``` +pub fn accumulate( + group_indices: &[usize], + values: &PrimitiveArray, + opt_filter: Option<&BooleanArray>, + mut value_fn: F, +) where + T: ArrowPrimitiveType + Send, + F: FnMut(usize, T::Native) + Send, +{ + let data: &[T::Native] = values.values(); + assert_eq!(data.len(), group_indices.len()); + + match (values.null_count() > 0, opt_filter) { + // no nulls, no filter, + (false, None) => { + let iter = group_indices.iter().zip(data.iter()); + for (&group_index, &new_value) in iter { + value_fn(group_index, new_value); + } + } + // nulls, no filter + (true, None) => { + let nulls = values.nulls().unwrap(); + // This is based on (ahem, COPY/PASTE) arrow::compute::aggregate::sum + // iterate over in chunks of 64 bits for more efficient null checking + let group_indices_chunks = group_indices.chunks_exact(64); + let data_chunks = data.chunks_exact(64); + let bit_chunks = nulls.inner().bit_chunks(); + + let group_indices_remainder = group_indices_chunks.remainder(); + let data_remainder = data_chunks.remainder(); + + group_indices_chunks + .zip(data_chunks) + .zip(bit_chunks.iter()) + .for_each(|((group_index_chunk, data_chunk), mask)| { + // index_mask has value 1 << i in the loop + let mut index_mask = 1; + group_index_chunk.iter().zip(data_chunk.iter()).for_each( + |(&group_index, &new_value)| { + // valid bit was set, real value + let is_valid = (mask & index_mask) != 0; + if is_valid { + value_fn(group_index, new_value); + } + index_mask <<= 1; + }, + ) + }); + + // handle any remaining bits (after the initial 64) + let remainder_bits = bit_chunks.remainder_bits(); + group_indices_remainder + .iter() + .zip(data_remainder.iter()) + .enumerate() + .for_each(|(i, (&group_index, 
&new_value))| { + let is_valid = remainder_bits & (1 << i) != 0; + if is_valid { + value_fn(group_index, new_value); + } + }); + } + // no nulls, but a filter + (false, Some(filter)) => { + assert_eq!(filter.len(), group_indices.len()); + // The performance with a filter could be improved by + // iterating over the filter in chunks, rather than a single + // iterator. TODO file a ticket + group_indices + .iter() + .zip(data.iter()) + .zip(filter.iter()) + .for_each(|((&group_index, &new_value), filter_value)| { + if let Some(true) = filter_value { + value_fn(group_index, new_value); + } + }) + } + // both null values and filters + (true, Some(filter)) => { + assert_eq!(filter.len(), group_indices.len()); + // The performance with a filter could be improved by + // iterating over the filter in chunks, rather than using + // iterators. TODO file a ticket + filter + .iter() + .zip(group_indices.iter()) + .zip(values.iter()) + .for_each(|((filter_value, &group_index), new_value)| { + if let Some(true) = filter_value { + if let Some(new_value) = new_value { + value_fn(group_index, new_value) + } + } + }) + } + } +} + /// This function is called to update the accumulator state per row /// when the value is not needed (e.g. COUNT) /// diff --git a/datafusion/functions-aggregate/src/stddev.rs b/datafusion/functions-aggregate/src/stddev.rs index 180f4ad3cf37..3534fb5b4d26 100644 --- a/datafusion/functions-aggregate/src/stddev.rs +++ b/datafusion/functions-aggregate/src/stddev.rs @@ -19,17 +19,21 @@ use std::any::Any; use std::fmt::{Debug, Formatter}; +use std::sync::Arc; +use arrow::array::Float64Array; use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field}; use datafusion_common::{internal_err, not_impl_err, Result}; use datafusion_common::{plan_err, ScalarValue}; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; -use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, GroupsAccumulator, Signature, Volatility, +}; use datafusion_functions_aggregate_common::stats::StatsType; -use crate::variance::VarianceAccumulator; +use crate::variance::{VarianceAccumulator, VarianceGroupsAccumulator}; make_udaf_expr_and_func!( Stddev, @@ -118,6 +122,17 @@ impl AggregateUDFImpl for Stddev { fn aliases(&self) -> &[String] { &self.alias } + + fn groups_accumulator_supported(&self, acc_args: AccumulatorArgs) -> bool { + !acc_args.is_distinct + } + + fn create_groups_accumulator( + &self, + _args: AccumulatorArgs, + ) -> Result> { + Ok(Box::new(StddevGroupsAccumulator::new(StatsType::Sample))) + } } make_udaf_expr_and_func!( @@ -201,6 +216,19 @@ impl AggregateUDFImpl for StddevPop { Ok(DataType::Float64) } + + fn groups_accumulator_supported(&self, acc_args: AccumulatorArgs) -> bool { + !acc_args.is_distinct + } + + fn create_groups_accumulator( + &self, + _args: AccumulatorArgs, + ) -> Result> { + Ok(Box::new(StddevGroupsAccumulator::new( + StatsType::Population, + ))) + } } /// An accumulator to compute the average @@ -267,6 +295,57 @@ impl Accumulator for StddevAccumulator { } } +#[derive(Debug)] +pub struct StddevGroupsAccumulator { + variance: VarianceGroupsAccumulator, +} + +impl StddevGroupsAccumulator { + pub fn new(s_type: StatsType) -> Self { + Self { + variance: VarianceGroupsAccumulator::new(s_type), + } + } +} + +impl GroupsAccumulator for StddevGroupsAccumulator { + fn update_batch( + &mut self, + values: &[ArrayRef], + group_indices: &[usize], + 
opt_filter: Option<&arrow::array::BooleanArray>, + total_num_groups: usize, + ) -> Result<()> { + self.variance + .update_batch(values, group_indices, opt_filter, total_num_groups) + } + + fn merge_batch( + &mut self, + values: &[ArrayRef], + group_indices: &[usize], + opt_filter: Option<&arrow::array::BooleanArray>, + total_num_groups: usize, + ) -> Result<()> { + self.variance + .merge_batch(values, group_indices, opt_filter, total_num_groups) + } + + fn evaluate(&mut self, emit_to: datafusion_expr::EmitTo) -> Result { + let (mut variances, nulls) = self.variance.variance(emit_to); + variances.iter_mut().for_each(|v| *v = v.sqrt()); + Ok(Arc::new(Float64Array::new(variances.into(), Some(nulls)))) + } + + fn state(&mut self, emit_to: datafusion_expr::EmitTo) -> Result> { + self.variance.state(emit_to) + } + + fn size(&self) -> usize { + self.variance.size() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/functions-aggregate/src/variance.rs b/datafusion/functions-aggregate/src/variance.rs index 4c78a42ea494..f5f2d06e3837 100644 --- a/datafusion/functions-aggregate/src/variance.rs +++ b/datafusion/functions-aggregate/src/variance.rs @@ -18,10 +18,11 @@ //! [`VarianceSample`]: variance sample aggregations. //! [`VariancePopulation`]: variance population aggregations. -use std::fmt::Debug; +use std::{fmt::Debug, sync::Arc}; use arrow::{ - array::{ArrayRef, Float64Array, UInt64Array}, + array::{Array, ArrayRef, BooleanArray, Float64Array, UInt64Array}, + buffer::NullBuffer, compute::kernels::cast, datatypes::{DataType, Field}, }; @@ -32,9 +33,11 @@ use datafusion_common::{ use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, utils::format_state_name, - Accumulator, AggregateUDFImpl, Signature, Volatility, + Accumulator, AggregateUDFImpl, GroupsAccumulator, Signature, Volatility, +}; +use datafusion_functions_aggregate_common::{ + aggregate::groups_accumulator::accumulate::accumulate, stats::StatsType, }; -use datafusion_functions_aggregate_common::stats::StatsType; make_udaf_expr_and_func!( VarianceSample, @@ -122,6 +125,17 @@ impl AggregateUDFImpl for VarianceSample { fn aliases(&self) -> &[String] { &self.aliases } + + fn groups_accumulator_supported(&self, acc_args: AccumulatorArgs) -> bool { + !acc_args.is_distinct + } + + fn create_groups_accumulator( + &self, + _args: AccumulatorArgs, + ) -> Result> { + Ok(Box::new(VarianceGroupsAccumulator::new(StatsType::Sample))) + } } pub struct VariancePopulation { @@ -196,6 +210,19 @@ impl AggregateUDFImpl for VariancePopulation { fn aliases(&self) -> &[String] { &self.aliases } + + fn groups_accumulator_supported(&self, acc_args: AccumulatorArgs) -> bool { + !acc_args.is_distinct + } + + fn create_groups_accumulator( + &self, + _args: AccumulatorArgs, + ) -> Result> { + Ok(Box::new(VarianceGroupsAccumulator::new( + StatsType::Population, + ))) + } } /// An accumulator to compute variance @@ -239,6 +266,36 @@ impl VarianceAccumulator { } } +#[inline] +fn merge( + count: u64, + mean: f64, + m2: f64, + count2: u64, + mean2: f64, + m22: f64, +) -> (u64, f64, f64) { + let new_count = count + count2; + let new_mean = + mean * count as f64 / new_count as f64 + mean2 * count2 as f64 / new_count as f64; + let delta = mean - mean2; + let new_m2 = + m2 + m22 + delta * delta * count as f64 * count2 as f64 / new_count as f64; + + (new_count, new_mean, new_m2) +} + +#[inline] +fn update(count: u64, mean: f64, m2: f64, value: f64) -> (u64, f64, f64) { + let new_count = count + 1; + let delta1 = value - mean; + let new_mean 
= delta1 / new_count as f64 + mean; + let delta2 = value - new_mean; + let new_m2 = m2 + delta1 * delta2; + + (new_count, new_mean, new_m2) +} + impl Accumulator for VarianceAccumulator { fn state(&mut self) -> Result> { Ok(vec![ @@ -253,15 +310,8 @@ impl Accumulator for VarianceAccumulator { let arr = downcast_value!(values, Float64Array).iter().flatten(); for value in arr { - let new_count = self.count + 1; - let delta1 = value - self.mean; - let new_mean = delta1 / new_count as f64 + self.mean; - let delta2 = value - new_mean; - let new_m2 = self.m2 + delta1 * delta2; - - self.count += 1; - self.mean = new_mean; - self.m2 = new_m2; + (self.count, self.mean, self.m2) = + update(self.count, self.mean, self.m2, value) } Ok(()) @@ -296,17 +346,14 @@ impl Accumulator for VarianceAccumulator { if c == 0_u64 { continue; } - let new_count = self.count + c; - let new_mean = self.mean * self.count as f64 / new_count as f64 - + means.value(i) * c as f64 / new_count as f64; - let delta = self.mean - means.value(i); - let new_m2 = self.m2 - + m2s.value(i) - + delta * delta * self.count as f64 * c as f64 / new_count as f64; - - self.count = new_count; - self.mean = new_mean; - self.m2 = new_m2; + (self.count, self.mean, self.m2) = merge( + self.count, + self.mean, + self.m2, + c, + means.value(i), + m2s.value(i), + ) } Ok(()) } @@ -344,3 +391,183 @@ impl Accumulator for VarianceAccumulator { true } } + +#[derive(Debug)] +pub struct VarianceGroupsAccumulator { + m2s: Vec, + means: Vec, + counts: Vec, + stats_type: StatsType, +} + +impl VarianceGroupsAccumulator { + pub fn new(s_type: StatsType) -> Self { + Self { + m2s: Vec::new(), + means: Vec::new(), + counts: Vec::new(), + stats_type: s_type, + } + } + + fn resize(&mut self, total_num_groups: usize) { + self.m2s.resize(total_num_groups, 0.0); + self.means.resize(total_num_groups, 0.0); + self.counts.resize(total_num_groups, 0); + } + + fn merge( + group_indices: &[usize], + counts: &UInt64Array, + means: &Float64Array, + m2s: &Float64Array, + opt_filter: Option<&BooleanArray>, + mut value_fn: F, + ) where + F: FnMut(usize, u64, f64, f64) + Send, + { + assert_eq!(counts.null_count(), 0); + assert_eq!(means.null_count(), 0); + assert_eq!(m2s.null_count(), 0); + + match opt_filter { + None => { + group_indices + .iter() + .zip(counts.values().iter()) + .zip(means.values().iter()) + .zip(m2s.values().iter()) + .for_each(|(((&group_index, &count), &mean), &m2)| { + value_fn(group_index, count, mean, m2); + }); + } + Some(filter) => { + group_indices + .iter() + .zip(counts.values().iter()) + .zip(means.values().iter()) + .zip(m2s.values().iter()) + .zip(filter.iter()) + .for_each( + |((((&group_index, &count), &mean), &m2), filter_value)| { + if let Some(true) = filter_value { + value_fn(group_index, count, mean, m2); + } + }, + ); + } + } + } + + pub fn variance( + &mut self, + emit_to: datafusion_expr::EmitTo, + ) -> (Vec, NullBuffer) { + let mut counts = emit_to.take_needed(&mut self.counts); + // means are only needed for updating m2s and are not needed for the final result. + // But we still need to take them to ensure the internal state is consistent. 
+        let _ = emit_to.take_needed(&mut self.means);
+        let m2s = emit_to.take_needed(&mut self.m2s);
+
+        if let StatsType::Sample = self.stats_type {
+            counts.iter_mut().for_each(|count| {
+                *count -= 1;
+            });
+        }
+        let nulls = NullBuffer::from_iter(counts.iter().map(|&count| count != 0));
+        let variance = m2s
+            .iter()
+            .zip(counts)
+            .map(|(m2, count)| m2 / count as f64)
+            .collect();
+        (variance, nulls)
+    }
+}
+
+impl GroupsAccumulator for VarianceGroupsAccumulator {
+    fn update_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&arrow::array::BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        assert_eq!(values.len(), 1, "single argument to update_batch");
+        let values = &cast(&values[0], &DataType::Float64)?;
+        let values = downcast_value!(values, Float64Array);
+
+        self.resize(total_num_groups);
+        accumulate(group_indices, values, opt_filter, |group_index, value| {
+            let (new_count, new_mean, new_m2) = update(
+                self.counts[group_index],
+                self.means[group_index],
+                self.m2s[group_index],
+                value,
+            );
+            self.counts[group_index] = new_count;
+            self.means[group_index] = new_mean;
+            self.m2s[group_index] = new_m2;
+        });
+        Ok(())
+    }
+
+    fn merge_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&arrow::array::BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        assert_eq!(values.len(), 3, "three arguments to merge_batch");
+        // first batch is counts, second is partial means, third is partial m2s
+        let partial_counts = downcast_value!(values[0], UInt64Array);
+        let partial_means = downcast_value!(values[1], Float64Array);
+        let partial_m2s = downcast_value!(values[2], Float64Array);
+
+        self.resize(total_num_groups);
+        Self::merge(
+            group_indices,
+            partial_counts,
+            partial_means,
+            partial_m2s,
+            opt_filter,
+            |group_index, partial_count, partial_mean, partial_m2| {
+                let (new_count, new_mean, new_m2) = merge(
+                    self.counts[group_index],
+                    self.means[group_index],
+                    self.m2s[group_index],
+                    partial_count,
+                    partial_mean,
+                    partial_m2,
+                );
+                self.counts[group_index] = new_count;
+                self.means[group_index] = new_mean;
+                self.m2s[group_index] = new_m2;
+            },
+        );
+        Ok(())
+    }
+
+    fn evaluate(&mut self, emit_to: datafusion_expr::EmitTo) -> Result<ArrayRef> {
+        let (variances, nulls) = self.variance(emit_to);
+        Ok(Arc::new(Float64Array::new(variances.into(), Some(nulls))))
+    }
+
+    fn state(&mut self, emit_to: datafusion_expr::EmitTo) -> Result<Vec<ArrayRef>> {
+        let counts = emit_to.take_needed(&mut self.counts);
+        let means = emit_to.take_needed(&mut self.means);
+        let m2s = emit_to.take_needed(&mut self.m2s);
+
+        Ok(vec![
+            Arc::new(UInt64Array::new(counts.into(), None)),
+            Arc::new(Float64Array::new(means.into(), None)),
+            Arc::new(Float64Array::new(m2s.into(), None)),
+        ])
+    }
+
+    fn size(&self) -> usize {
+        self.m2s.capacity() * std::mem::size_of::<f64>()
+            + self.means.capacity() * std::mem::size_of::<f64>()
+            + self.counts.capacity() * std::mem::size_of::<u64>()
+    }
+}
diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
index b826dd31f4d1..45cb4d4615d7 100644
--- a/datafusion/sqllogictest/test_files/aggregate.slt
+++ b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -511,6 +511,85 @@ select stddev(sq.column1) from (values (1.1), (2.0), (3.0)) as sq
 ----
 0.950438495292

+# csv_query_stddev_7
+query IR
+SELECT c2, stddev_samp(c12) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2
+----
+1 0.303641032262
+2 0.284581967411
+3 0.296002660506
+4 0.284324609109
+5
0.331034486752 + +# csv_query_stddev_8 +query IR +SELECT c2, stddev_pop(c12) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2 +---- +1 0.296659845456 +2 0.278038978602 +3 0.288107833475 +4 0.278074953424 +5 0.318992813225 + +# csv_query_stddev_9 +query IR +SELECT c2, var_pop(c12) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2 +---- +1 0.088007063906 +2 0.077305673622 +3 0.083006123709 +4 0.077325679722 +5 0.101756414889 + +# csv_query_stddev_10 +query IR +SELECT c2, var_samp(c12) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2 +---- +1 0.092197876473 +2 0.080986896176 +3 0.087617575027 +4 0.080840483345 +5 0.109583831419 + +# csv_query_stddev_11 +query IR +SELECT c2, var_samp(c12) FROM aggregate_test_100 WHERE c12 > 0.90 GROUP BY c2 ORDER BY c2 +---- +1 0.000889240174 +2 0.000785878272 +3 NULL +4 NULL +5 0.000269544643 + +# Use PostgresSQL dialect +statement ok +set datafusion.sql_parser.dialect = 'Postgres'; + +# csv_query_stddev_12 +query IR +SELECT c2, var_samp(c12) FILTER (WHERE c12 > 0.90) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2 +---- +1 0.000889240174 +2 0.000785878272 +3 NULL +4 NULL +5 0.000269544643 + +# Restore the default dialect +statement ok +set datafusion.sql_parser.dialect = 'Generic'; + +# csv_query_stddev_13 +query IR +SELECT c2, var_samp(CASE WHEN c12 > 0.90 THEN c12 ELSE null END) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2 +---- +1 0.000889240174 +2 0.000785878272 +3 NULL +4 NULL +5 0.000269544643 + + # csv_query_approx_median_1 query I SELECT approx_median(c2) FROM aggregate_test_100 From 7d49fb308d638bef2c1ed1fa96fc87e3f740c72e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 26 Aug 2024 15:12:23 -0400 Subject: [PATCH 05/23] Minor: refine Partitioning documentation (#12145) --- datafusion/expr/src/logical_plan/plan.rs | 6 +++++- datafusion/physical-expr/src/partitioning.rs | 6 ++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 3ede7f25b753..d6574040c596 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2932,7 +2932,11 @@ impl Debug for Subquery { } } -/// Logical partitioning schemes supported by the repartition operator. +/// Logical partitioning schemes supported by [`LogicalPlan::Repartition`] +/// +/// See [`Partitioning`] for more details on partitioning +/// +/// [`Partitioning`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/enum.Partitioning.html# #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum Partitioning { /// Allocate batches using a round-robin algorithm and the specified number of partitions diff --git a/datafusion/physical-expr/src/partitioning.rs b/datafusion/physical-expr/src/partitioning.rs index 6472dd47489c..45beeb7b81af 100644 --- a/datafusion/physical-expr/src/partitioning.rs +++ b/datafusion/physical-expr/src/partitioning.rs @@ -24,8 +24,8 @@ use crate::{physical_exprs_equal, EquivalenceProperties, PhysicalExpr}; /// Output partitioning supported by [`ExecutionPlan`]s. /// -/// When `executed`, `ExecutionPlan`s produce one or more independent stream of -/// data batches in parallel, referred to as partitions. The streams are Rust +/// Calling [`ExecutionPlan::execute`] produce one or more independent streams of +/// [`RecordBatch`]es in parallel, referred to as partitions. The streams are Rust /// `async` [`Stream`]s (a special kind of future). The number of output /// partitions varies based on the input and the operation performed. 
/// @@ -102,6 +102,8 @@ use crate::{physical_exprs_equal, EquivalenceProperties, PhysicalExpr}; /// Plans such as `FilterExec` produce the same number of output streams /// (partitions) as input streams (partitions). /// +/// [`RecordBatch`]: arrow::record_batch::RecordBatch +/// [`ExecutionPlan::execute`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#tymethod.execute /// [`ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html /// [`Stream`]: https://docs.rs/futures/latest/futures/stream/trait.Stream.html #[derive(Debug, Clone)] From ed12f113cb1d33a1eea80c41787a93bc52f196c7 Mon Sep 17 00:00:00 2001 From: theirix Date: Mon, 26 Aug 2024 20:13:34 +0100 Subject: [PATCH 06/23] Minor: allow to build RuntimeEnv from RuntimeConfig (#12151) * Allow to build RuntimeEnv from RuntimeConfig * Fix formatting --- datafusion/core/tests/fuzz_cases/sort_fuzz.rs | 9 +++++---- datafusion/execution/src/runtime_env.rs | 5 +++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs index f4b4f16aa160..eda306dd3de5 100644 --- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs @@ -22,7 +22,7 @@ use arrow::{ compute::SortOptions, record_batch::RecordBatch, }; -use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::execution::runtime_env::RuntimeConfig; use datafusion::physical_plan::expressions::PhysicalSortExpr; use datafusion::physical_plan::memory::MemoryExec; use datafusion::physical_plan::sorts::sort::SortExec; @@ -136,9 +136,10 @@ impl SortTest { .sort_spill_reservation_bytes, ); - let runtime_config = RuntimeConfig::new() - .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size))); - let runtime = Arc::new(RuntimeEnv::new(runtime_config).unwrap()); + let runtime_env = RuntimeConfig::new() + .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size))) + .build(); + let runtime = Arc::new(runtime_env.unwrap()); SessionContext::new_with_config_rt(session_config, runtime) } else { SessionContext::new_with_config(session_config) diff --git a/datafusion/execution/src/runtime_env.rs b/datafusion/execution/src/runtime_env.rs index 420246595558..bddce81f537b 100644 --- a/datafusion/execution/src/runtime_env.rs +++ b/datafusion/execution/src/runtime_env.rs @@ -228,4 +228,9 @@ impl RuntimeConfig { pub fn with_temp_file_path(self, path: impl Into) -> Self { self.with_disk_manager(DiskManagerConfig::new_specified(vec![path.into()])) } + + /// Build a `RuntimeEnv` object from the configuration + pub fn build(self) -> Result { + RuntimeEnv::new(self) + } } From 55a1459b123c0bb16e2cbe41cc59cf3582115932 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Mon, 26 Aug 2024 12:19:27 -0700 Subject: [PATCH 07/23] benches: add lower benches (#12152) --- datafusion/functions/benches/lower.rs | 92 ++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/benches/lower.rs b/datafusion/functions/benches/lower.rs index fa963f174e46..934c1c6bd189 100644 --- a/datafusion/functions/benches/lower.rs +++ b/datafusion/functions/benches/lower.rs @@ -17,8 +17,10 @@ extern crate criterion; -use arrow::array::{ArrayRef, StringArray}; -use arrow::util::bench_util::create_string_array_with_len; +use arrow::array::{ArrayRef, StringArray, StringViewBuilder}; +use arrow::util::bench_util::{ + create_string_array_with_len, 
create_string_view_array_with_len, +}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use datafusion_expr::ColumnarValue; use datafusion_functions::string; @@ -65,6 +67,58 @@ fn create_args3(size: usize) -> Vec { vec![ColumnarValue::Array(array)] } +/// Create an array of args containing StringViews, where all the values in the +/// StringViews are ASCII. +/// * `size` - the length of the StringViews, and +/// * `str_len` - the length of the strings within the array. +/// * `null_density` - the density of null values in the array. +/// * `mixed` - whether the array is mixed between inlined and referenced strings. +fn create_args4( + size: usize, + str_len: usize, + null_density: f32, + mixed: bool, +) -> Vec { + let array = Arc::new(create_string_view_array_with_len( + size, + null_density, + str_len, + mixed, + )); + + vec![ColumnarValue::Array(array)] +} + +/// Create an array of args containing a StringViewArray, where some of the values in the +/// array are non-ASCII. +/// * `size` - the length of the StringArray, and +/// * `non_ascii_density` - the density of non-ASCII values in the array. +/// * `null_density` - the density of null values in the array. +fn create_args5( + size: usize, + non_ascii_density: f32, + null_density: f32, +) -> Vec { + let mut string_view_builder = StringViewBuilder::with_capacity(size); + for _ in 0..size { + // sample null_density to determine if the value should be null + if rand::random::() < null_density { + string_view_builder.append_null(); + continue; + } + + // sample non_ascii_density to determine if the value should be non-ASCII + if rand::random::() < non_ascii_density { + string_view_builder.append_value("农历新年农历新年农历新年农历新年农历新年"); + } else { + string_view_builder.append_value("DATAFUSIONDATAFUSIONDATAFUSION"); + } + } + + let array = Arc::new(string_view_builder.finish()) as ArrayRef; + vec![ColumnarValue::Array(array)] +} + fn criterion_benchmark(c: &mut Criterion) { let lower = string::lower(); for size in [1024, 4096, 8192] { @@ -85,6 +139,40 @@ fn criterion_benchmark(c: &mut Criterion) { |b| b.iter(|| black_box(lower.invoke(&args))), ); } + + let sizes = [4096, 8192]; + let str_lens = [10, 64, 128]; + let mixes = [true, false]; + let null_densities = [0.0f32, 0.1f32]; + + for null_density in &null_densities { + for &mixed in &mixes { + for &str_len in &str_lens { + for &size in &sizes { + let args = create_args4(size, str_len, *null_density, mixed); + c.bench_function( + &format!("lower_all_values_are_ascii_string_views: size: {}, str_len: {}, null_density: {}, mixed: {}", + size, str_len, null_density, mixed), + |b| b.iter(|| black_box(lower.invoke(&args))), + ); + + let args = create_args4(size, str_len, *null_density, mixed); + c.bench_function( + &format!("lower_all_values_are_ascii_string_views: size: {}, str_len: {}, null_density: {}, mixed: {}", + size, str_len, null_density, mixed), + |b| b.iter(|| black_box(lower.invoke(&args))), + ); + + let args = create_args5(size, 0.1, *null_density); + c.bench_function( + &format!("lower_some_values_are_nonascii_string_views: size: {}, str_len: {}, non_ascii_density: {}, null_density: {}, mixed: {}", + size, str_len, 0.1, null_density, mixed), + |b| b.iter(|| black_box(lower.invoke(&args))), + ); + } + } + } + } } criterion_group!(benches, criterion_benchmark); From 0f96af5b500efff72314f840a59a736787cc3def Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Mon, 26 Aug 2024 21:24:51 +0200 Subject: [PATCH 08/23] Replace Arc::try_unwrap with Arc::unwrap_or_clone where 
cloning anyway (#12173) --- datafusion/core/src/execution/context/mod.rs | 2 +- datafusion/expr/src/expr_fn.rs | 8 ++++---- datafusion/expr/src/logical_plan/plan.rs | 5 +---- datafusion/sql/src/unparser/rewrite.rs | 5 +---- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 49cbf9ebde9d..c2a072eef756 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -688,7 +688,7 @@ impl SessionContext { column_defaults, } = cmd; - let input = Arc::try_unwrap(input).unwrap_or_else(|e| e.as_ref().clone()); + let input = Arc::unwrap_or_clone(input); let input = self.state().optimize(&input)?; let table = self.table(name.clone()).await; match (if_not_exists, or_replace, table) { diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 4e6022399653..1e0b601146dd 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -394,7 +394,7 @@ pub fn create_udf( volatility: Volatility, fun: ScalarFunctionImplementation, ) -> ScalarUDF { - let return_type = Arc::try_unwrap(return_type).unwrap_or_else(|t| t.as_ref().clone()); + let return_type = Arc::unwrap_or_clone(return_type); ScalarUDF::from(SimpleScalarUDF::new( name, input_types, @@ -476,8 +476,8 @@ pub fn create_udaf( accumulator: AccumulatorFactoryFunction, state_type: Arc>, ) -> AggregateUDF { - let return_type = Arc::try_unwrap(return_type).unwrap_or_else(|t| t.as_ref().clone()); - let state_type = Arc::try_unwrap(state_type).unwrap_or_else(|t| t.as_ref().clone()); + let return_type = Arc::unwrap_or_clone(return_type); + let state_type = Arc::unwrap_or_clone(state_type); let state_fields = state_type .into_iter() .enumerate() @@ -594,7 +594,7 @@ pub fn create_udwf( volatility: Volatility, partition_evaluator_factory: PartitionEvaluatorFactory, ) -> WindowUDF { - let return_type = Arc::try_unwrap(return_type).unwrap_or_else(|t| t.as_ref().clone()); + let return_type = Arc::unwrap_or_clone(return_type); WindowUDF::from(SimpleWindowUDF::new( name, input_type, diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index d6574040c596..096b65896b69 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1160,10 +1160,7 @@ impl LogicalPlan { Ok(if let LogicalPlan::Prepare(prepare_lp) = plan_with_values { param_values.verify(&prepare_lp.data_types)?; // try and take ownership of the input if is not shared, clone otherwise - match Arc::try_unwrap(prepare_lp.input) { - Ok(input) => input, - Err(arc_input) => arc_input.as_ref().clone(), - } + Arc::unwrap_or_clone(prepare_lp.input) } else { plan_with_values }) diff --git a/datafusion/sql/src/unparser/rewrite.rs b/datafusion/sql/src/unparser/rewrite.rs index f6725485f920..fed3f2c39f9d 100644 --- a/datafusion/sql/src/unparser/rewrite.rs +++ b/datafusion/sql/src/unparser/rewrite.rs @@ -59,10 +59,7 @@ pub(super) fn normalize_union_schema(plan: &LogicalPlan) -> Result let transformed_plan = plan.transform_up(|plan| match plan { LogicalPlan::Union(mut union) => { - let schema = match Arc::try_unwrap(union.schema) { - Ok(inner) => inner, - Err(schema) => (*schema).clone(), - }; + let schema = Arc::unwrap_or_clone(union.schema); let schema = schema.strip_qualifiers(); union.schema = Arc::new(schema); From dff590bfd2bb9993b2c8ce6f76a3bdd973e520a8 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 26 Aug 2024 
13:57:17 -0700
Subject: [PATCH 09/23] fix: Skip buffered rows which are not joined with
 streamed side when checking join filter results (#12159)

---
 datafusion/physical-plan/src/joins/sort_merge_join.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs
index 56cd699bf6e7..511cb4c55fcd 100644
--- a/datafusion/physical-plan/src/joins/sort_merge_join.rs
+++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs
@@ -1474,6 +1474,12 @@ impl SMJStream {
                     [chunk.buffered_batch_idx.unwrap()];

                 for i in 0..pre_mask.len() {
+                    // If the buffered row is not joined with streamed side,
+                    // skip it.
+                    if buffered_indices.is_null(i) {
+                        continue;
+                    }
+
                     let buffered_index = buffered_indices.value(i);

                     buffered_batch.join_filter_failed_map.insert(

From 1e67cb6239dbaa896344335678356c24ef1df509 Mon Sep 17 00:00:00 2001
From: Jax Liu
Date: Tue, 27 Aug 2024 08:22:42 +0800
Subject: [PATCH 10/23] enable the null map tests (#12176)

---
 datafusion/sqllogictest/test_files/map.slt | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt
index b7a0a74913b0..270e4beccc52 100644
--- a/datafusion/sqllogictest/test_files/map.slt
+++ b/datafusion/sqllogictest/test_files/map.slt
@@ -376,11 +376,10 @@ SELECT MAP {'a':1, 'b':2, 'c':3 }['a'] FROM t;
 1
 1

-# TODO(https://github.com/sqlparser-rs/sqlparser-rs/pull/1361): support parsing an empty map. Enable this after upgrading sqlparser-rs.
-# query ?
-# SELECT MAP {};
-# ----
-# {}
+query ?
+SELECT MAP {};
+----
+{}

 # values contain null
 query ?

From a723b79eca998dfa61cd2aefa122779336b153c7 Mon Sep 17 00:00:00 2001
From: Piotr Findeisen
Date: Tue, 27 Aug 2024 03:34:30 +0200
Subject: [PATCH 11/23] Remove unwrap_arc helper (#12172)

It can now be replaced with a single call to `Arc::unwrap_or_clone`, with
the added bonus of a slightly better name.
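For context, a small standalone sketch (not part of the patch) of the standard-library method that replaces the helper: `Arc::unwrap_or_clone` moves the value out when the reference count is one and clones it otherwise, which is exactly what `unwrap_arc` did.

```rust
use std::sync::Arc;

fn main() {
    let a = Arc::new(String::from("plan"));
    let b = Arc::clone(&a);

    // Two references exist, so unwrapping `b` clones the inner String...
    let cloned = Arc::unwrap_or_clone(b);
    // ...but `a` is now the only reference, so this moves without cloning.
    let moved = Arc::unwrap_or_clone(a);

    assert_eq!(cloned, moved);
}
```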
--- datafusion/expr/src/logical_plan/builder.rs | 37 +++++++++++-------- datafusion/expr/src/logical_plan/plan.rs | 3 +- datafusion/expr/src/logical_plan/tree_node.rs | 11 +----- .../optimizer/src/analyzer/type_coercion.rs | 18 ++++++--- .../optimizer/src/common_subexpr_eliminate.rs | 11 +++--- .../src/decorrelate_predicate_subquery.rs | 9 +++-- .../optimizer/src/eliminate_cross_join.rs | 27 +++++++++++--- datafusion/optimizer/src/eliminate_filter.rs | 3 +- datafusion/optimizer/src/eliminate_limit.rs | 6 ++- .../optimizer/src/eliminate_nested_union.rs | 14 +++---- .../optimizer/src/eliminate_one_union.rs | 10 +++-- .../optimizer/src/eliminate_outer_join.rs | 3 +- .../optimizer/src/optimize_projections/mod.rs | 14 +++---- .../optimizer/src/propagate_empty_relation.rs | 3 +- datafusion/optimizer/src/push_down_filter.rs | 23 ++++++------ datafusion/optimizer/src/push_down_limit.rs | 3 +- datafusion/sql/src/select.rs | 10 ++--- 17 files changed, 110 insertions(+), 95 deletions(-) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index fdd07da023e0..6abcfe4279bc 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -41,9 +41,8 @@ use crate::utils::{ find_valid_equijoin_key_pair, group_window_expr_by_sort_keys, }; use crate::{ - and, binary_expr, logical_plan::tree_node::unwrap_arc, DmlStatement, Expr, - ExprSchemable, Operator, RecursiveQuery, TableProviderFilterPushDown, TableSource, - WriteOp, + and, binary_expr, DmlStatement, Expr, ExprSchemable, Operator, RecursiveQuery, + TableProviderFilterPushDown, TableSource, WriteOp, }; use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; @@ -376,7 +375,7 @@ impl LogicalPlanBuilder { self, expr: impl IntoIterator>, ) -> Result { - project(unwrap_arc(self.plan), expr).map(Self::new) + project(Arc::unwrap_or_clone(self.plan), expr).map(Self::new) } /// Select the given column indices @@ -429,7 +428,7 @@ impl LogicalPlanBuilder { /// Apply an alias pub fn alias(self, alias: impl Into) -> Result { - subquery_alias(unwrap_arc(self.plan), alias).map(Self::new) + subquery_alias(Arc::unwrap_or_clone(self.plan), alias).map(Self::new) } /// Add missing sort columns to all downstream projection @@ -484,7 +483,7 @@ impl LogicalPlanBuilder { Self::ambiguous_distinct_check(&missing_exprs, missing_cols, &expr)?; } expr.extend(missing_exprs); - project(unwrap_arc(input), expr) + project(Arc::unwrap_or_clone(input), expr) } _ => { let is_distinct = @@ -580,8 +579,11 @@ impl LogicalPlanBuilder { let new_expr = schema.columns().into_iter().map(Expr::Column).collect(); let is_distinct = false; - let plan = - Self::add_missing_columns(unwrap_arc(self.plan), &missing_cols, is_distinct)?; + let plan = Self::add_missing_columns( + Arc::unwrap_or_clone(self.plan), + &missing_cols, + is_distinct, + )?; let sort_plan = LogicalPlan::Sort(Sort { expr: normalize_cols(exprs, &plan)?, input: Arc::new(plan), @@ -595,12 +597,12 @@ impl LogicalPlanBuilder { /// Apply a union, preserving duplicate rows pub fn union(self, plan: LogicalPlan) -> Result { - union(unwrap_arc(self.plan), plan).map(Self::new) + union(Arc::unwrap_or_clone(self.plan), plan).map(Self::new) } /// Apply a union, removing duplicate rows pub fn union_distinct(self, plan: LogicalPlan) -> Result { - let left_plan: LogicalPlan = unwrap_arc(self.plan); + let left_plan: LogicalPlan = Arc::unwrap_or_clone(self.plan); let right_plan: LogicalPlan = plan; 
Ok(Self::new(LogicalPlan::Distinct(Distinct::All(Arc::new( @@ -1064,7 +1066,7 @@ impl LogicalPlanBuilder { /// Build the plan pub fn build(self) -> Result { - Ok(unwrap_arc(self.plan)) + Ok(Arc::unwrap_or_clone(self.plan)) } /// Apply a join with the expression on constraint. @@ -1138,7 +1140,7 @@ impl LogicalPlanBuilder { /// Unnest the given column. pub fn unnest_column(self, column: impl Into) -> Result { - unnest(unwrap_arc(self.plan), vec![column.into()]).map(Self::new) + unnest(Arc::unwrap_or_clone(self.plan), vec![column.into()]).map(Self::new) } /// Unnest the given column given [`UnnestOptions`] @@ -1147,8 +1149,12 @@ impl LogicalPlanBuilder { column: impl Into, options: UnnestOptions, ) -> Result { - unnest_with_options(unwrap_arc(self.plan), vec![column.into()], options) - .map(Self::new) + unnest_with_options( + Arc::unwrap_or_clone(self.plan), + vec![column.into()], + options, + ) + .map(Self::new) } /// Unnest the given columns with the given [`UnnestOptions`] @@ -1157,7 +1163,8 @@ impl LogicalPlanBuilder { columns: Vec, options: UnnestOptions, ) -> Result { - unnest_with_options(unwrap_arc(self.plan), columns, options).map(Self::new) + unnest_with_options(Arc::unwrap_or_clone(self.plan), columns, options) + .map(Self::new) } } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 096b65896b69..359de2d30a57 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -51,7 +51,6 @@ use datafusion_common::{ // backwards compatibility use crate::display::PgJsonVisitor; -use crate::logical_plan::tree_node::unwrap_arc; pub use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; pub use datafusion_common::{JoinConstraint, JoinType}; @@ -770,7 +769,7 @@ impl LogicalPlan { .. }) => { // Update schema with unnested column type. - unnest_with_options(unwrap_arc(input), exec_columns, options) + unnest_with_options(Arc::unwrap_or_clone(input), exec_columns, options) } } } diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 539cb1cf5fb2..273404c8df31 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -379,21 +379,12 @@ impl TreeNode for LogicalPlan { } } -/// Converts a `Arc` without copying, if possible. 
Copies the plan -/// if there is a shared reference -pub fn unwrap_arc(plan: Arc) -> LogicalPlan { - Arc::try_unwrap(plan) - // if None is returned, there is another reference to this - // LogicalPlan, so we can not own it, and must clone instead - .unwrap_or_else(|node| node.as_ref().clone()) -} - /// Applies `f` to rewrite a `Arc` without copying, if possible fn rewrite_arc Result>>( plan: Arc, mut f: F, ) -> Result>> { - f(unwrap_arc(plan))?.map_data(|new_plan| Ok(Arc::new(new_plan))) + f(Arc::unwrap_or_clone(plan))?.map_data(|new_plan| Ok(Arc::new(new_plan))) } /// rewrite a `Vec` of `Arc` without copying, if possible diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 315284c50839..a6b9bad6c5d9 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -37,7 +37,6 @@ use datafusion_expr::expr::{ }; use datafusion_expr::expr_rewriter::coerce_plan_expr_for_schema; use datafusion_expr::expr_schema::cast_subquery; -use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::logical_plan::Subquery; use datafusion_expr::type_coercion::binary::{ comparison_coercion, get_input_types, like_coercion, @@ -250,15 +249,19 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { subquery, outer_ref_columns, }) => { - let new_plan = analyze_internal(self.schema, unwrap_arc(subquery))?.data; + let new_plan = + analyze_internal(self.schema, Arc::unwrap_or_clone(subquery))?.data; Ok(Transformed::yes(Expr::ScalarSubquery(Subquery { subquery: Arc::new(new_plan), outer_ref_columns, }))) } Expr::Exists(Exists { subquery, negated }) => { - let new_plan = - analyze_internal(self.schema, unwrap_arc(subquery.subquery))?.data; + let new_plan = analyze_internal( + self.schema, + Arc::unwrap_or_clone(subquery.subquery), + )? + .data; Ok(Transformed::yes(Expr::Exists(Exists { subquery: Subquery { subquery: Arc::new(new_plan), @@ -272,8 +275,11 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { subquery, negated, }) => { - let new_plan = - analyze_internal(self.schema, unwrap_arc(subquery.subquery))?.data; + let new_plan = analyze_internal( + self.schema, + Arc::unwrap_or_clone(subquery.subquery), + )? + .data; let expr_type = expr.get_type(self.schema)?; let subquery_type = new_plan.schema().field(0).data_type(); let common_type = comparison_coercion(&expr_type, subquery_type).ok_or(plan_datafusion_err!( diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index feccf5679efb..3fcee4123b76 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -33,7 +33,6 @@ use datafusion_common::tree_node::{ }; use datafusion_common::{qualified_name, Column, DFSchema, DFSchemaRef, Result}; use datafusion_expr::expr::{Alias, ScalarFunction}; -use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::logical_plan::{ Aggregate, Filter, LogicalPlan, Projection, Sort, Window, }; @@ -314,7 +313,7 @@ impl CommonSubexprEliminate { schema, .. } = projection; - let input = unwrap_arc(input); + let input = Arc::unwrap_or_clone(input); self.try_unary_plan(expr, input, config)? 
.map_data(|(new_expr, new_input)| { Projection::try_new_with_schema(new_expr, Arc::new(new_input), schema) @@ -327,7 +326,7 @@ impl CommonSubexprEliminate { config: &dyn OptimizerConfig, ) -> Result> { let Sort { expr, input, fetch } = sort; - let input = unwrap_arc(input); + let input = Arc::unwrap_or_clone(input); let new_sort = self.try_unary_plan(expr, input, config)?.update_data( |(new_expr, new_input)| { LogicalPlan::Sort(Sort { @@ -348,7 +347,7 @@ impl CommonSubexprEliminate { let Filter { predicate, input, .. } = filter; - let input = unwrap_arc(input); + let input = Arc::unwrap_or_clone(input); let expr = vec![predicate]; self.try_unary_plan(expr, input, config)? .map_data(|(mut new_expr, new_input)| { @@ -458,7 +457,7 @@ impl CommonSubexprEliminate { schema, .. } = aggregate; - let input = unwrap_arc(input); + let input = Arc::unwrap_or_clone(input); // Extract common sub-expressions from the aggregate and grouping expressions. self.find_common_exprs(vec![group_expr, aggr_expr], config, ExprMask::Normal)? .map_data(|common| { @@ -729,7 +728,7 @@ fn get_consecutive_window_exprs( window_expr_list.push(window_expr); window_schemas.push(schema); - plan = unwrap_arc(input); + plan = Arc::unwrap_or_clone(input); } (window_expr_list, window_schemas, plan) } diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index b6d49490d437..f1cae1099a4d 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -37,7 +37,6 @@ use datafusion_expr::{ LogicalPlan, LogicalPlanBuilder, Operator, }; -use datafusion_expr::logical_plan::tree_node::unwrap_arc; use log::debug; /// Optimizer rule for rewriting predicate(IN/EXISTS) subquery to left semi/anti joins @@ -55,8 +54,10 @@ impl DecorrelatePredicateSubquery { mut subquery: Subquery, config: &dyn OptimizerConfig, ) -> Result { - subquery.subquery = - Arc::new(self.rewrite(unwrap_arc(subquery.subquery), config)?.data); + subquery.subquery = Arc::new( + self.rewrite(Arc::unwrap_or_clone(subquery.subquery), config)? + .data, + ); Ok(subquery) } @@ -164,7 +165,7 @@ impl OptimizerRule for DecorrelatePredicateSubquery { } // iterate through all exists clauses in predicate, turning each into a join - let mut cur_input = unwrap_arc(input); + let mut cur_input = Arc::unwrap_or_clone(input); for subquery in subqueries { if let Some(plan) = build_join(&subquery, &cur_input, config.alias_generator())? diff --git a/datafusion/optimizer/src/eliminate_cross_join.rs b/datafusion/optimizer/src/eliminate_cross_join.rs index fc4eaef80903..20e6641e4d62 100644 --- a/datafusion/optimizer/src/eliminate_cross_join.rs +++ b/datafusion/optimizer/src/eliminate_cross_join.rs @@ -24,7 +24,6 @@ use crate::join_key_set::JoinKeySet; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{internal_err, Result}; use datafusion_expr::expr::{BinaryExpr, Expr}; -use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::logical_plan::{ CrossJoin, Filter, Join, JoinConstraint, JoinType, LogicalPlan, Projection, }; @@ -114,7 +113,7 @@ impl OptimizerRule for EliminateCrossJoin { input, predicate, .. 
} = filter; flatten_join_inputs( - unwrap_arc(input), + Arc::unwrap_or_clone(input), &mut possible_join_keys, &mut all_inputs, )?; @@ -217,12 +216,28 @@ fn flatten_join_inputs( ); } possible_join_keys.insert_all_owned(join.on); - flatten_join_inputs(unwrap_arc(join.left), possible_join_keys, all_inputs)?; - flatten_join_inputs(unwrap_arc(join.right), possible_join_keys, all_inputs)?; + flatten_join_inputs( + Arc::unwrap_or_clone(join.left), + possible_join_keys, + all_inputs, + )?; + flatten_join_inputs( + Arc::unwrap_or_clone(join.right), + possible_join_keys, + all_inputs, + )?; } LogicalPlan::CrossJoin(join) => { - flatten_join_inputs(unwrap_arc(join.left), possible_join_keys, all_inputs)?; - flatten_join_inputs(unwrap_arc(join.right), possible_join_keys, all_inputs)?; + flatten_join_inputs( + Arc::unwrap_or_clone(join.left), + possible_join_keys, + all_inputs, + )?; + flatten_join_inputs( + Arc::unwrap_or_clone(join.right), + possible_join_keys, + all_inputs, + )?; } _ => { all_inputs.push(plan); diff --git a/datafusion/optimizer/src/eliminate_filter.rs b/datafusion/optimizer/src/eliminate_filter.rs index 84bb8e782142..bb2b4547e9c2 100644 --- a/datafusion/optimizer/src/eliminate_filter.rs +++ b/datafusion/optimizer/src/eliminate_filter.rs @@ -19,7 +19,6 @@ use datafusion_common::tree_node::Transformed; use datafusion_common::{Result, ScalarValue}; -use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::{EmptyRelation, Expr, Filter, LogicalPlan}; use std::sync::Arc; @@ -65,7 +64,7 @@ impl OptimizerRule for EliminateFilter { input, .. }) => match v { - Some(true) => Ok(Transformed::yes(unwrap_arc(input))), + Some(true) => Ok(Transformed::yes(Arc::unwrap_or_clone(input))), Some(false) | None => Ok(Transformed::yes(LogicalPlan::EmptyRelation( EmptyRelation { produce_one_row: false, diff --git a/datafusion/optimizer/src/eliminate_limit.rs b/datafusion/optimizer/src/eliminate_limit.rs index a42fe6a6f95b..e48f37a77cd3 100644 --- a/datafusion/optimizer/src/eliminate_limit.rs +++ b/datafusion/optimizer/src/eliminate_limit.rs @@ -20,7 +20,7 @@ use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::Transformed; use datafusion_common::Result; -use datafusion_expr::logical_plan::{tree_node::unwrap_arc, EmptyRelation, LogicalPlan}; +use datafusion_expr::logical_plan::{EmptyRelation, LogicalPlan}; use std::sync::Arc; /// Optimizer rule to replace `LIMIT 0` or `LIMIT` whose ancestor LIMIT's skip is @@ -74,7 +74,9 @@ impl OptimizerRule for EliminateLimit { } } else if limit.skip == 0 { // input also can be Limit, so we should apply again. 
- return Ok(self.rewrite(unwrap_arc(limit.input), _config).unwrap()); + return Ok(self + .rewrite(Arc::unwrap_or_clone(limit.input), _config) + .unwrap()); } Ok(Transformed::no(LogicalPlan::Limit(limit))) } diff --git a/datafusion/optimizer/src/eliminate_nested_union.rs b/datafusion/optimizer/src/eliminate_nested_union.rs index 5d7895bba4d8..e9b38567a982 100644 --- a/datafusion/optimizer/src/eliminate_nested_union.rs +++ b/datafusion/optimizer/src/eliminate_nested_union.rs @@ -21,7 +21,6 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::Transformed; use datafusion_common::Result; use datafusion_expr::expr_rewriter::coerce_plan_expr_for_schema; -use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::{Distinct, LogicalPlan, Union}; use itertools::Itertools; use std::sync::Arc; @@ -69,7 +68,7 @@ impl OptimizerRule for EliminateNestedUnion { }))) } LogicalPlan::Distinct(Distinct::All(nested_plan)) => { - match unwrap_arc(nested_plan) { + match Arc::unwrap_or_clone(nested_plan) { LogicalPlan::Union(Union { inputs, schema }) => { let inputs = inputs .into_iter() @@ -96,16 +95,17 @@ impl OptimizerRule for EliminateNestedUnion { } fn extract_plans_from_union(plan: Arc) -> Vec { - match unwrap_arc(plan) { - LogicalPlan::Union(Union { inputs, .. }) => { - inputs.into_iter().map(unwrap_arc).collect::>() - } + match Arc::unwrap_or_clone(plan) { + LogicalPlan::Union(Union { inputs, .. }) => inputs + .into_iter() + .map(Arc::unwrap_or_clone) + .collect::>(), plan => vec![plan], } } fn extract_plan_from_distinct(plan: Arc) -> Arc { - match unwrap_arc(plan) { + match Arc::unwrap_or_clone(plan) { LogicalPlan::Distinct(Distinct::All(plan)) => plan, plan => Arc::new(plan), } diff --git a/datafusion/optimizer/src/eliminate_one_union.rs b/datafusion/optimizer/src/eliminate_one_union.rs index 43024107c4f8..383b33637f6f 100644 --- a/datafusion/optimizer/src/eliminate_one_union.rs +++ b/datafusion/optimizer/src/eliminate_one_union.rs @@ -16,9 +16,11 @@ // under the License. //! [`EliminateOneUnion`] eliminates single element `Union` + use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::{tree_node::Transformed, Result}; -use datafusion_expr::logical_plan::{tree_node::unwrap_arc, LogicalPlan, Union}; +use datafusion_expr::logical_plan::{LogicalPlan, Union}; +use std::sync::Arc; use crate::optimizer::ApplyOrder; @@ -48,9 +50,9 @@ impl OptimizerRule for EliminateOneUnion { _config: &dyn OptimizerConfig, ) -> Result> { match plan { - LogicalPlan::Union(Union { mut inputs, .. }) if inputs.len() == 1 => { - Ok(Transformed::yes(unwrap_arc(inputs.pop().unwrap()))) - } + LogicalPlan::Union(Union { mut inputs, .. }) if inputs.len() == 1 => Ok( + Transformed::yes(Arc::unwrap_or_clone(inputs.pop().unwrap())), + ), _ => Ok(Transformed::no(plan)), } } diff --git a/datafusion/optimizer/src/eliminate_outer_join.rs b/datafusion/optimizer/src/eliminate_outer_join.rs index 12534e058152..e7c88df55122 100644 --- a/datafusion/optimizer/src/eliminate_outer_join.rs +++ b/datafusion/optimizer/src/eliminate_outer_join.rs @@ -18,7 +18,6 @@ //! 
[`EliminateOuterJoin`] converts `LEFT/RIGHT/FULL` joins to `INNER` joins use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::{Column, DFSchema, Result}; -use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::logical_plan::{Join, JoinType, LogicalPlan}; use datafusion_expr::{Expr, Filter, Operator}; @@ -79,7 +78,7 @@ impl OptimizerRule for EliminateOuterJoin { _config: &dyn OptimizerConfig, ) -> Result> { match plan { - LogicalPlan::Filter(mut filter) => match unwrap_arc(filter.input) { + LogicalPlan::Filter(mut filter) => match Arc::unwrap_or_clone(filter.input) { LogicalPlan::Join(join) => { let mut non_nullable_cols: Vec = vec![]; diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index ac4ed87a4a1a..5db5afd11062 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -41,7 +41,6 @@ use crate::utils::NamePreserver; use datafusion_common::tree_node::{ Transformed, TreeNode, TreeNodeIterator, TreeNodeRecursion, }; -use datafusion_expr::logical_plan::tree_node::unwrap_arc; /// Optimizer rule to prune unnecessary columns from intermediate schemas /// inside the [`LogicalPlan`]. This rule: @@ -181,7 +180,7 @@ fn optimize_projections( let necessary_exprs = necessary_indices.get_required_exprs(schema); return optimize_projections( - unwrap_arc(aggregate.input), + Arc::unwrap_or_clone(aggregate.input), config, necessary_indices, )? @@ -221,7 +220,7 @@ fn optimize_projections( child_reqs.with_exprs(&input_schema, &new_window_expr)?; return optimize_projections( - unwrap_arc(window.input), + Arc::unwrap_or_clone(window.input), config, required_indices.clone(), )? @@ -488,7 +487,7 @@ fn merge_consecutive_projections(proj: Projection) -> Result { let parents_predicates = split_conjunction_owned(filter.predicate); @@ -1139,19 +1138,19 @@ fn convert_to_cross_join_if_beneficial( match plan { // Can be converted back to cross join LogicalPlan::Join(join) if join.on.is_empty() && join.filter.is_none() => { - LogicalPlanBuilder::from(unwrap_arc(join.left)) - .cross_join(unwrap_arc(join.right))? + LogicalPlanBuilder::from(Arc::unwrap_or_clone(join.left)) + .cross_join(Arc::unwrap_or_clone(join.right))? .build() .map(Transformed::yes) } - LogicalPlan::Filter(filter) => convert_to_cross_join_if_beneficial(unwrap_arc( - filter.input, - ))? - .transform_data(|child_plan| { - Filter::try_new(filter.predicate, Arc::new(child_plan)) - .map(LogicalPlan::Filter) - .map(Transformed::yes) - }), + LogicalPlan::Filter(filter) => { + convert_to_cross_join_if_beneficial(Arc::unwrap_or_clone(filter.input))? + .transform_data(|child_plan| { + Filter::try_new(filter.predicate, Arc::new(child_plan)) + .map(LogicalPlan::Filter) + .map(Transformed::yes) + }) + } plan => Ok(Transformed::no(plan)), } } diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 290b893577b8..dff0b61c6b22 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -26,7 +26,6 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::Transformed; use datafusion_common::utils::combine_limit; use datafusion_common::Result; -use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::logical_plan::{Join, JoinType, Limit, LogicalPlan}; /// Optimization rule that tries to push down `LIMIT`. 
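The same take-then-rebuild pattern repeats across these optimizer rules: consume the node, take ownership of its child with `Arc::unwrap_or_clone`, and either drop the node or reconstruct it around the rewritten child. A toy model of the `EliminateLimit` case above, with a hypothetical `Plan` enum standing in for `LogicalPlan`:

    use std::sync::Arc;

    #[derive(Clone, Debug, PartialEq)]
    enum Plan {
        Scan(&'static str),
        Limit {
            skip: usize,
            fetch: Option<usize>,
            input: Arc<Plan>,
        },
    }

    // A LIMIT with no fetch and zero skip filters nothing, so the node can be
    // replaced by its input; ownership is taken without copying when possible.
    fn eliminate_noop_limit(plan: Plan) -> Plan {
        match plan {
            Plan::Limit {
                skip: 0,
                fetch: None,
                input,
            } => Arc::unwrap_or_clone(input),
            other => other,
        }
    }

    fn main() {
        let plan = Plan::Limit {
            skip: 0,
            fetch: None,
            input: Arc::new(Plan::Scan("t1")),
        };
        assert_eq!(eliminate_noop_limit(plan), Plan::Scan("t1"));
    }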
@@ -83,7 +82,7 @@ impl OptimizerRule for PushDownLimit { }))); }; - match unwrap_arc(input) { + match Arc::unwrap_or_clone(input) { LogicalPlan::TableScan(mut scan) => { let rows_needed = if fetch != 0 { fetch + skip } else { 0 }; let new_fetch = scan diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 45fda094557b..384893bfa94c 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -33,7 +33,6 @@ use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, WildcardOptions}; use datafusion_expr::expr_rewriter::{ normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_cols, }; -use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::utils::{ expr_as_column_expr, expr_to_columns, find_aggregate_exprs, find_window_exprs, }; @@ -361,9 +360,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .build() } LogicalPlan::Filter(mut filter) => { - filter.input = Arc::new( - self.try_process_aggregate_unnest(unwrap_arc(filter.input))?, - ); + filter.input = + Arc::new(self.try_process_aggregate_unnest(Arc::unwrap_or_clone( + filter.input, + ))?); Ok(LogicalPlan::Filter(filter)) } _ => Ok(input), @@ -401,7 +401,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Projection: tab.array_col AS unnest(tab.array_col) // TableScan: tab // ``` - let mut intermediate_plan = unwrap_arc(input); + let mut intermediate_plan = Arc::unwrap_or_clone(input); let mut intermediate_select_exprs = group_expr; loop { From 9ef2f95241fc761929b71bdf9148ad221a6f34c2 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 27 Aug 2024 03:37:09 +0200 Subject: [PATCH 12/23] Fix typo (#12169) --- datafusion/expr/src/utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 11a244a944f8..8f04d3ec3067 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -463,7 +463,7 @@ pub fn expand_qualified_wildcard( /// if bool is true SortExpr comes from `PARTITION BY` column, if false comes from `ORDER BY` column type WindowSortKey = Vec<(Expr, bool)>; -/// Generate a sort key for a given window expr's partition_by and order_bu expr +/// Generate a sort key for a given window expr's partition_by and order_by expr pub fn generate_sort_key( partition_by: &[Expr], order_by: &[Expr], From 7e9ea3ad59071d56093c197c5ecd5c50021deb94 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 26 Aug 2024 21:38:43 -0400 Subject: [PATCH 13/23] Minor: remove vestigal github workflow (#12182) --- .github/workflows/pr_comment.yml | 53 -------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 .github/workflows/pr_comment.yml diff --git a/.github/workflows/pr_comment.yml b/.github/workflows/pr_comment.yml deleted file mode 100644 index 8b6df1c75687..000000000000 --- a/.github/workflows/pr_comment.yml +++ /dev/null @@ -1,53 +0,0 @@ -# Downloads any `message` artifacts created by other jobs -# and posts them as comments to the PR -name: PR Comment - -on: - workflow_run: - workflows: ["Benchmarks"] - types: - - completed - -jobs: - comment: - name: PR Comment - runs-on: ubuntu-latest - if: github.event.workflow_run.conclusion == 'success' - steps: - - name: Dump GitHub context - env: - GITHUB_CONTEXT: ${{ toJSON(github) }} - run: echo "$GITHUB_CONTEXT" - - - name: Download comment message - uses: actions/download-artifact@v4 - with: - name: message - run-id: ${{ github.event.workflow_run.id }} - github-token: ${{ secrets.GITHUB_TOKEN }} - - 
- name: Download pr number - uses: actions/download-artifact@v4 - with: - name: pr - run-id: ${{ github.event.workflow_run.id }} - github-token: ${{ secrets.GITHUB_TOKEN }} - - - name: Print message and pr number - run: | - cat pr - echo "PR_NUMBER=$(cat pr)" >> "$GITHUB_ENV" - cat message.md - - - name: Post the comment - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - const content = fs.readFileSync('message.md', 'utf8'); - github.rest.issues.createComment({ - issue_number: process.env.PR_NUMBER, - owner: context.repo.owner, - repo: context.repo.repo, - body: content, - }) From 2ac0842954a1ee5605be54810ed668e63379dfdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=9E=97=E4=BC=9F?= Date: Tue, 27 Aug 2024 21:09:58 +0800 Subject: [PATCH 14/23] Remove `AggregateExpr` trait (#12096) * save * Move AggregateFunctionExpr into physical-expr crate * Move AggregateExpr trait into physical-expr crate * Remove AggregateExpr trait * Fix doc and tests * Remove empty crate physical-expr-functions-aggregate * Use func name instead of expr name * Fix tests * Fix tests and clippy lints * Cargo update for datafusion-cli * Add docs * Add functions back * Fix doc --- Cargo.toml | 2 - datafusion-cli/Cargo.lock | 160 ++++++---- datafusion/core/Cargo.toml | 1 - datafusion/core/src/lib.rs | 6 - .../combine_partial_final_agg.rs | 13 +- .../physical_optimizer/update_aggr_exprs.rs | 9 +- datafusion/core/src/physical_planner.rs | 10 +- datafusion/core/src/test_util/mod.rs | 8 +- .../core/tests/fuzz_cases/aggregate_fuzz.rs | 2 +- .../src/aggregate.rs | 167 ---------- .../functions-aggregate-common/src/utils.rs | 21 +- .../Cargo.toml | 48 --- .../src/lib.rs | 20 -- .../src/aggregate.rs | 294 ++++++++++++------ datafusion/physical-expr/src/lib.rs | 25 +- .../physical-expr/src/window/aggregate.rs | 11 +- .../src/window/sliding_aggregate.rs | 13 +- .../src/aggregate_statistics.rs | 38 +-- datafusion/physical-plan/Cargo.toml | 1 - .../physical-plan/src/aggregates/mod.rs | 110 +++---- .../physical-plan/src/aggregates/row_hash.rs | 9 +- .../physical-plan/src/execution_plan.rs | 2 +- datafusion/physical-plan/src/lib.rs | 4 +- datafusion/physical-plan/src/windows/mod.rs | 7 +- datafusion/proto/src/physical_plan/mod.rs | 7 +- .../proto/src/physical_plan/to_proto.rs | 72 ++--- .../tests/cases/roundtrip_physical_plan.rs | 20 +- 27 files changed, 442 insertions(+), 638 deletions(-) delete mode 100644 datafusion/physical-expr-functions-aggregate/Cargo.toml delete mode 100644 datafusion/physical-expr-functions-aggregate/src/lib.rs rename datafusion/{physical-expr-functions-aggregate => physical-expr}/src/aggregate.rs (69%) diff --git a/Cargo.toml b/Cargo.toml index 124747999041..479d2cadc65b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,6 @@ members = [ "datafusion/optimizer", "datafusion/physical-expr", "datafusion/physical-expr-common", - "datafusion/physical-expr-functions-aggregate", "datafusion/physical-optimizer", "datafusion/physical-plan", "datafusion/proto", @@ -106,7 +105,6 @@ datafusion-functions-window = { path = "datafusion/functions-window", version = datafusion-optimizer = { path = "datafusion/optimizer", version = "41.0.0", default-features = false } datafusion-physical-expr = { path = "datafusion/physical-expr", version = "41.0.0", default-features = false } datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "41.0.0", default-features = false } -datafusion-physical-expr-functions-aggregate = { path = 
"datafusion/physical-expr-functions-aggregate", version = "41.0.0" } datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "41.0.0" } datafusion-physical-plan = { path = "datafusion/physical-plan", version = "41.0.0" } datafusion-proto = { path = "datafusion/proto", version = "41.0.0" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 50333d17ca8d..1e89bb3af87e 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "adler32" version = "1.2.0" @@ -167,9 +173,9 @@ checksum = "9d151e35f61089500b617991b791fc8bfd237ae50cd5950803758a179b41e67a" [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" @@ -430,7 +436,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -765,7 +771,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.4", "object", "rustc-demangle", ] @@ -815,9 +821,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.3" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9ec96fe9a81b5e365f9db71fe00edc4fe4ca2cc7dcb7861f0603012a7caa210" +checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" dependencies = [ "arrayref", "arrayvec", @@ -999,7 +1005,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1155,7 +1161,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1206,7 +1212,6 @@ dependencies = [ "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", - "datafusion-physical-expr-functions-aggregate", "datafusion-physical-optimizer", "datafusion-physical-plan", "datafusion-sql", @@ -1501,20 +1506,6 @@ dependencies = [ "rand", ] -[[package]] -name = "datafusion-physical-expr-functions-aggregate" -version = "41.0.0" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr-common", - "rand", -] - [[package]] name = "datafusion-physical-optimizer" version = "41.0.0" @@ -1546,7 +1537,6 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr", "datafusion-physical-expr-common", - "datafusion-physical-expr-functions-aggregate", "futures", "half", "hashbrown", @@ -1743,12 +1733,12 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.31" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" +checksum = 
"9c0596c1eac1f9e04ed902702e9878208b336edc9d6fddc8a48387349bab3666" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.8.0", ] [[package]] @@ -1831,7 +1821,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1924,9 +1914,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" dependencies = [ "atomic-waker", "bytes", @@ -2111,7 +2101,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "http 1.1.0", "http-body 1.0.1", "httparse", @@ -2148,7 +2138,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "rustls 0.23.12", - "rustls-native-certs 0.7.1", + "rustls-native-certs 0.7.2", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -2353,9 +2343,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.156" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5f43f184355eefb8d17fc948dbecf6c13be3c141f20d834ae842193a448c72a" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "libflate" @@ -2489,6 +2479,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "1.0.2" @@ -2829,7 +2828,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3028,9 +3027,9 @@ dependencies = [ [[package]] name = "redox_users" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", @@ -3074,15 +3073,15 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "reqwest" -version = "0.12.5" +version = "0.12.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" dependencies = [ "base64 0.22.1", "bytes", "futures-core", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "http 1.1.0", "http-body 1.0.1", "http-body-util", @@ -3098,7 +3097,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls 0.23.12", - "rustls-native-certs 0.7.1", + "rustls-native-certs 0.7.2", "rustls-pemfile 2.1.3", "rustls-pki-types", "serde", @@ -3114,7 +3113,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "winreg", + "windows-registry", ] [[package]] @@ -3253,9 +3252,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" +checksum = "04182dffc9091a404e0fc069ea5cd60e5b866c3adf881eff99a32d048242dffa" 
dependencies = [ "openssl-probe", "rustls-pemfile 2.1.3", @@ -3421,7 +3420,7 @@ checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3563,7 +3562,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3609,7 +3608,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3622,7 +3621,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3644,9 +3643,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.74" +version = "2.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" +checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" dependencies = [ "proc-macro2", "quote", @@ -3658,6 +3657,9 @@ name = "sync_wrapper" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +dependencies = [ + "futures-core", +] [[package]] name = "tempfile" @@ -3704,7 +3706,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3774,9 +3776,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.2" +version = "1.39.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" +checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" dependencies = [ "backtrace", "bytes", @@ -3798,7 +3800,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3895,7 +3897,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3940,7 +3942,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -4095,7 +4097,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", "wasm-bindgen-shared", ] @@ -4129,7 +4131,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4213,6 +4215,36 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result", + "windows-strings", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -4361,16 +4393,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "winreg" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "xmlparser" version = "0.13.6" @@ -4404,7 +4426,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index adbba3eb31d6..de228e058096 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -110,7 +110,6 @@ datafusion-functions-window = { workspace = true } datafusion-optimizer = { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-physical-expr-common = { workspace = true } -datafusion-physical-expr-functions-aggregate = { workspace = true } datafusion-physical-optimizer = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-sql = { workspace = true } diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 735a381586ad..67f3cb01c0a4 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -490,7 +490,6 @@ //! [`PhysicalOptimizerRule`]: datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule //! [`Schema`]: arrow::datatypes::Schema //! [`PhysicalExpr`]: physical_plan::PhysicalExpr -//! [`AggregateExpr`]: physical_plan::AggregateExpr //! [`RecordBatch`]: arrow::record_batch::RecordBatch //! [`RecordBatchReader`]: arrow::record_batch::RecordBatchReader //! 
[`Array`]: arrow::array::Array @@ -556,11 +555,6 @@ pub mod physical_expr_common { pub use datafusion_physical_expr_common::*; } -/// re-export of [`datafusion_physical_expr_functions_aggregate`] crate -pub mod physical_expr_functions_aggregate { - pub use datafusion_physical_expr_functions_aggregate::*; -} - /// re-export of [`datafusion_physical_expr`] crate pub mod physical_expr { pub use datafusion_physical_expr::*; diff --git a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs index 8cbb187f7bd2..1a12fc7de888 100644 --- a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs +++ b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs @@ -26,7 +26,8 @@ use crate::physical_plan::ExecutionPlan; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion_physical_expr::{physical_exprs_equal, AggregateExpr, PhysicalExpr}; +use datafusion_physical_expr::aggregate::AggregateFunctionExpr; +use datafusion_physical_expr::{physical_exprs_equal, PhysicalExpr}; use datafusion_physical_optimizer::PhysicalOptimizerRule; /// CombinePartialFinalAggregate optimizer rule combines the adjacent Partial and Final AggregateExecs @@ -122,7 +123,7 @@ impl PhysicalOptimizerRule for CombinePartialFinalAggregate { type GroupExprsRef<'a> = ( &'a PhysicalGroupBy, - &'a [Arc], + &'a [Arc], &'a [Option>], ); @@ -171,8 +172,8 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_functions_aggregate::count::count_udaf; use datafusion_functions_aggregate::sum::sum_udaf; + use datafusion_physical_expr::aggregate::AggregateExprBuilder; use datafusion_physical_expr::expressions::col; - use datafusion_physical_expr_functions_aggregate::aggregate::AggregateExprBuilder; /// Runs the CombinePartialFinalAggregate optimizer and asserts the plan against the expected macro_rules! 
assert_optimized { @@ -224,7 +225,7 @@ mod tests { fn partial_aggregate_exec( input: Arc, group_by: PhysicalGroupBy, - aggr_expr: Vec>, + aggr_expr: Vec>, ) -> Arc { let schema = input.schema(); let n_aggr = aggr_expr.len(); @@ -244,7 +245,7 @@ mod tests { fn final_aggregate_exec( input: Arc, group_by: PhysicalGroupBy, - aggr_expr: Vec>, + aggr_expr: Vec>, ) -> Arc { let schema = input.schema(); let n_aggr = aggr_expr.len(); @@ -272,7 +273,7 @@ mod tests { expr: Arc, name: &str, schema: &Schema, - ) -> Arc { + ) -> Arc { AggregateExprBuilder::new(count_udaf(), vec![expr]) .schema(Arc::new(schema.clone())) .alias(name) diff --git a/datafusion/core/src/physical_optimizer/update_aggr_exprs.rs b/datafusion/core/src/physical_optimizer/update_aggr_exprs.rs index f8edf73e3d2a..a2726d62e9f6 100644 --- a/datafusion/core/src/physical_optimizer/update_aggr_exprs.rs +++ b/datafusion/core/src/physical_optimizer/update_aggr_exprs.rs @@ -23,8 +23,9 @@ use std::sync::Arc; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{plan_datafusion_err, Result}; +use datafusion_physical_expr::aggregate::AggregateFunctionExpr; use datafusion_physical_expr::{ - reverse_order_bys, AggregateExpr, EquivalenceProperties, PhysicalSortRequirement, + reverse_order_bys, EquivalenceProperties, PhysicalSortRequirement, }; use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::aggregates::concat_slices; @@ -117,7 +118,7 @@ impl PhysicalOptimizerRule for OptimizeAggregateOrder { /// /// # Parameters /// -/// * `aggr_exprs` - A vector of `Arc` representing the +/// * `aggr_exprs` - A vector of `Arc` representing the /// aggregate expressions to be optimized. /// * `prefix_requirement` - An array slice representing the ordering /// requirements preceding the aggregate expressions. @@ -130,10 +131,10 @@ impl PhysicalOptimizerRule for OptimizeAggregateOrder { /// successfully. Any errors occurring during the conversion process are /// passed through. 
fn try_convert_aggregate_if_better( - aggr_exprs: Vec>, + aggr_exprs: Vec>, prefix_requirement: &[PhysicalSortRequirement], eq_properties: &EquivalenceProperties, -) -> Result>> { +) -> Result>> { aggr_exprs .into_iter() .map(|aggr_expr| { diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 8d6c5089fa34..9501d3c6bbbb 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -58,8 +58,8 @@ use crate::physical_plan::unnest::UnnestExec; use crate::physical_plan::values::ValuesExec; use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::{ - displayable, windows, AggregateExpr, ExecutionPlan, ExecutionPlanProperties, - InputOrderMode, Partitioning, PhysicalExpr, WindowExpr, + displayable, windows, ExecutionPlan, ExecutionPlanProperties, InputOrderMode, + Partitioning, PhysicalExpr, WindowExpr, }; use arrow::compute::SortOptions; @@ -81,9 +81,9 @@ use datafusion_expr::{ DescribeTable, DmlStatement, Extension, Filter, RecursiveQuery, StringifiedPlan, WindowFrame, WindowFrameBound, WriteOp, }; +use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr::LexOrdering; -use datafusion_physical_expr_functions_aggregate::aggregate::AggregateExprBuilder; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_sql::utils::window_expr_common_partition_keys; @@ -719,7 +719,7 @@ impl DefaultPhysicalPlanner { // optimization purposes. For example, a FIRST_VALUE may turn // into a LAST_VALUE with the reverse ordering requirement. // To reflect such changes to subsequent stages, use the updated - // `AggregateExpr`/`PhysicalSortExpr` objects. + // `AggregateFunctionExpr`/`PhysicalSortExpr` objects. 
let updated_aggregates = initial_aggr.aggr_expr().to_vec(); let next_partition_mode = if can_repartition { @@ -1541,7 +1541,7 @@ pub fn create_window_expr( } type AggregateExprWithOptionalArgs = ( - Arc, + Arc, // The filter clause, if any Option>, // Ordering requirements, if any diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index ca8376fdec0a..faa9378535fd 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -48,13 +48,11 @@ use datafusion_common::TableReference; use datafusion_expr::utils::COUNT_STAR_EXPANSION; use datafusion_expr::{CreateExternalTable, Expr, TableType}; use datafusion_functions_aggregate::count::count_udaf; -use datafusion_physical_expr::{ - expressions, AggregateExpr, EquivalenceProperties, PhysicalExpr, -}; +use datafusion_physical_expr::{expressions, EquivalenceProperties, PhysicalExpr}; use async_trait::async_trait; use datafusion_catalog::Session; -use datafusion_physical_expr_functions_aggregate::aggregate::AggregateExprBuilder; +use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; use futures::Stream; use tempfile::TempDir; // backwards compatibility @@ -429,7 +427,7 @@ impl TestAggregate { } /// Return appropriate expr depending if COUNT is for col or table (*) - pub fn count_expr(&self, schema: &Schema) -> Arc { + pub fn count_expr(&self, schema: &Schema) -> Arc { AggregateExprBuilder::new(count_udaf(), vec![self.column()]) .schema(Arc::new(schema.clone())) .alias(self.column_name()) diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 138e5bda7f39..62e9be63983c 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -25,7 +25,7 @@ use arrow::util::pretty::pretty_format_batches; use arrow_array::types::Int64Type; use datafusion::common::Result; use datafusion::datasource::MemTable; -use datafusion::physical_expr_functions_aggregate::aggregate::AggregateExprBuilder; +use datafusion::physical_expr::aggregate::AggregateExprBuilder; use datafusion::physical_plan::aggregates::{ AggregateExec, AggregateMode, PhysicalGroupBy, }; diff --git a/datafusion/functions-aggregate-common/src/aggregate.rs b/datafusion/functions-aggregate-common/src/aggregate.rs index 698d1350cb61..c9cbaa8396fc 100644 --- a/datafusion/functions-aggregate-common/src/aggregate.rs +++ b/datafusion/functions-aggregate-common/src/aggregate.rs @@ -15,172 +15,5 @@ // specific language governing permissions and limitations // under the License. -//! [`AggregateExpr`] which defines the interface all aggregate expressions -//! (built-in and custom) need to satisfy. 
- -use crate::order::AggregateOrderSensitivity; -use arrow::datatypes::{DataType, Field}; -use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue}; -use datafusion_expr_common::accumulator::Accumulator; -use datafusion_expr_common::groups_accumulator::GroupsAccumulator; -use datafusion_physical_expr_common::physical_expr::PhysicalExpr; -use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; -use std::fmt::Debug; -use std::{any::Any, sync::Arc}; - pub mod count_distinct; pub mod groups_accumulator; - -/// An aggregate expression that: -/// * knows its resulting field -/// * knows how to create its accumulator -/// * knows its accumulator's state's field -/// * knows the expressions from whose its accumulator will receive values -/// -/// Any implementation of this trait also needs to implement the -/// `PartialEq` to allows comparing equality between the -/// trait objects. -pub trait AggregateExpr: Send + Sync + Debug + PartialEq { - /// Returns the aggregate expression as [`Any`] so that it can be - /// downcast to a specific implementation. - fn as_any(&self) -> &dyn Any; - - /// the field of the final result of this aggregation. - fn field(&self) -> Result; - - /// the accumulator used to accumulate values from the expressions. - /// the accumulator expects the same number of arguments as `expressions` and must - /// return states with the same description as `state_fields` - fn create_accumulator(&self) -> Result>; - - /// the fields that encapsulate the Accumulator's state - /// the number of fields here equals the number of states that the accumulator contains - fn state_fields(&self) -> Result>; - - /// expressions that are passed to the Accumulator. - /// Single-column aggregations such as `sum` return a single value, others (e.g. `cov`) return many. - fn expressions(&self) -> Vec>; - - /// Order by requirements for the aggregate function - /// By default it is `None` (there is no requirement) - /// Order-sensitive aggregators, such as `FIRST_VALUE(x ORDER BY y)` should implement this - fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { - None - } - - /// Indicates whether aggregator can produce the correct result with any - /// arbitrary input ordering. By default, we assume that aggregate expressions - /// are order insensitive. - fn order_sensitivity(&self) -> AggregateOrderSensitivity { - AggregateOrderSensitivity::Insensitive - } - - /// Sets the indicator whether ordering requirements of the aggregator is - /// satisfied by its input. If this is not the case, aggregators with order - /// sensitivity `AggregateOrderSensitivity::Beneficial` can still produce - /// the correct result with possibly more work internally. - /// - /// # Returns - /// - /// Returns `Ok(Some(updated_expr))` if the process completes successfully. - /// If the expression can benefit from existing input ordering, but does - /// not implement the method, returns an error. Order insensitive and hard - /// requirement aggregators return `Ok(None)`. - fn with_beneficial_ordering( - self: Arc, - _requirement_satisfied: bool, - ) -> Result>> { - if self.order_bys().is_some() && self.order_sensitivity().is_beneficial() { - return exec_err!( - "Should implement with satisfied for aggregator :{:?}", - self.name() - ); - } - Ok(None) - } - - /// Human readable name such as `"MIN(c2)"`. The default - /// implementation returns placeholder text. 
- fn name(&self) -> &str { - "AggregateExpr: default name" - } - - /// If the aggregate expression has a specialized - /// [`GroupsAccumulator`] implementation. If this returns true, - /// `[Self::create_groups_accumulator`] will be called. - fn groups_accumulator_supported(&self) -> bool { - false - } - - /// Return a specialized [`GroupsAccumulator`] that manages state - /// for all groups. - /// - /// For maximum performance, a [`GroupsAccumulator`] should be - /// implemented in addition to [`Accumulator`]. - fn create_groups_accumulator(&self) -> Result> { - not_impl_err!("GroupsAccumulator hasn't been implemented for {self:?} yet") - } - - /// Construct an expression that calculates the aggregate in reverse. - /// Typically the "reverse" expression is itself (e.g. SUM, COUNT). - /// For aggregates that do not support calculation in reverse, - /// returns None (which is the default value). - fn reverse_expr(&self) -> Option> { - None - } - - /// Creates accumulator implementation that supports retract - fn create_sliding_accumulator(&self) -> Result> { - not_impl_err!("Retractable Accumulator hasn't been implemented for {self:?} yet") - } - - /// Returns all expressions used in the [`AggregateExpr`]. - /// These expressions are (1)function arguments, (2) order by expressions. - fn all_expressions(&self) -> AggregatePhysicalExpressions { - let args = self.expressions(); - let order_bys = self.order_bys().unwrap_or(&[]); - let order_by_exprs = order_bys - .iter() - .map(|sort_expr| Arc::clone(&sort_expr.expr)) - .collect::>(); - AggregatePhysicalExpressions { - args, - order_by_exprs, - } - } - - /// Rewrites [`AggregateExpr`], with new expressions given. The argument should be consistent - /// with the return value of the [`AggregateExpr::all_expressions`] method. - /// Returns `Some(Arc)` if re-write is supported, otherwise returns `None`. - fn with_new_expressions( - &self, - _args: Vec>, - _order_by_exprs: Vec>, - ) -> Option> { - None - } - - /// If this function is max, return (output_field, true) - /// if the function is min, return (output_field, false) - /// otherwise return None (the default) - /// - /// output_field is the name of the column produced by this aggregate - /// - /// Note: this is used to use special aggregate implementations in certain conditions - fn get_minmax_desc(&self) -> Option<(Field, bool)> { - None - } - - /// Returns default value of the function given the input is Null - /// Most of the aggregate function return Null if input is Null, - /// while `count` returns 0 if input is Null - fn default_value(&self, data_type: &DataType) -> Result; -} - -/// Stores the physical expressions used inside the `AggregateExpr`. -pub struct AggregatePhysicalExpressions { - /// Aggregate function arguments - pub args: Vec>, - /// Order by expressions - pub order_by_exprs: Vec>, -} diff --git a/datafusion/functions-aggregate-common/src/utils.rs b/datafusion/functions-aggregate-common/src/utils.rs index 7b8ce0397af8..4fba772d8ddc 100644 --- a/datafusion/functions-aggregate-common/src/utils.rs +++ b/datafusion/functions-aggregate-common/src/utils.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
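The `down_cast_any_ref` helper deleted just below existed only to support `PartialEq` between `dyn AggregateExpr` trait objects; with the trait gone and `AggregateFunctionExpr` a concrete struct, equality can be derived directly. A toy illustration of the downcast dance the helper enabled, using a hypothetical `Agg` trait and `Count` type:

    use std::any::Any;

    trait Agg {
        fn as_any(&self) -> &dyn Any;
    }

    #[derive(PartialEq)]
    struct Count {
        distinct: bool,
    }

    impl Agg for Count {
        fn as_any(&self) -> &dyn Any {
            self
        }
    }

    // Comparing two `dyn Agg` values requires downcasting both sides back to
    // the same concrete type before `PartialEq` can be used. A concrete
    // struct makes all of this machinery unnecessary.
    fn agg_eq(a: &dyn Agg, b: &dyn Agg) -> bool {
        match (
            a.as_any().downcast_ref::<Count>(),
            b.as_any().downcast_ref::<Count>(),
        ) {
            (Some(x), Some(y)) => x == y,
            _ => false,
        }
    }

    fn main() {
        let (a, b) = (Count { distinct: true }, Count { distinct: true });
        assert!(agg_eq(&a, &b));
    }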
-use std::{any::Any, sync::Arc}; +use std::sync::Arc; use arrow::array::{ArrayRef, AsArray}; use arrow::datatypes::ArrowNativeType; @@ -32,25 +32,6 @@ use datafusion_common::{exec_err, DataFusionError, Result}; use datafusion_expr_common::accumulator::Accumulator; use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; -use crate::aggregate::AggregateExpr; - -/// Downcast a `Box` or `Arc` -/// and return the inner trait object as [`Any`] so -/// that it can be downcast to a specific implementation. -/// -/// This method is used when implementing the `PartialEq` -/// for [`AggregateExpr`] aggregation expressions and allows comparing the equality -/// between the trait objects. -pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { - if let Some(obj) = any.downcast_ref::>() { - obj.as_any() - } else if let Some(obj) = any.downcast_ref::>() { - obj.as_any() - } else { - any - } -} - /// Convert scalar values from an accumulator into arrays. pub fn get_accum_scalar_values_as_arrays( accum: &mut dyn Accumulator, diff --git a/datafusion/physical-expr-functions-aggregate/Cargo.toml b/datafusion/physical-expr-functions-aggregate/Cargo.toml deleted file mode 100644 index 6eed89614c53..000000000000 --- a/datafusion/physical-expr-functions-aggregate/Cargo.toml +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[package] -name = "datafusion-physical-expr-functions-aggregate" -description = "Logical plan and expression representation for DataFusion query engine" -keywords = ["datafusion", "logical", "plan", "expressions"] -readme = "README.md" -version = { workspace = true } -edition = { workspace = true } -homepage = { workspace = true } -repository = { workspace = true } -license = { workspace = true } -authors = { workspace = true } -rust-version = { workspace = true } - -[lints] -workspace = true - -[lib] -name = "datafusion_physical_expr_functions_aggregate" -path = "src/lib.rs" - -[features] - -[dependencies] -ahash = { workspace = true } -arrow = { workspace = true } -datafusion-common = { workspace = true } -datafusion-expr = { workspace = true } -datafusion-expr-common = { workspace = true } -datafusion-functions-aggregate-common = { workspace = true } -datafusion-physical-expr-common = { workspace = true } -rand = { workspace = true } diff --git a/datafusion/physical-expr-functions-aggregate/src/lib.rs b/datafusion/physical-expr-functions-aggregate/src/lib.rs deleted file mode 100644 index 2ff7ff5777ec..000000000000 --- a/datafusion/physical-expr-functions-aggregate/src/lib.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Technically, all aggregate functions that depend on `expr` crate should be included here. - -pub mod aggregate; diff --git a/datafusion/physical-expr-functions-aggregate/src/aggregate.rs b/datafusion/physical-expr/src/aggregate.rs similarity index 69% rename from datafusion/physical-expr-functions-aggregate/src/aggregate.rs rename to datafusion/physical-expr/src/aggregate.rs index fd986e00a7ef..5c1216f2a386 100644 --- a/datafusion/physical-expr-functions-aggregate/src/aggregate.rs +++ b/datafusion/physical-expr/src/aggregate.rs @@ -15,29 +15,46 @@ // specific language governing permissions and limitations // under the License. +pub(crate) mod groups_accumulator { + #[allow(unused_imports)] + pub(crate) mod accumulate { + pub use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::NullState; + } + pub use datafusion_functions_aggregate_common::aggregate::groups_accumulator::{ + accumulate::NullState, GroupsAccumulatorAdapter, + }; +} +pub(crate) mod stats { + pub use datafusion_functions_aggregate_common::stats::StatsType; +} +pub mod utils { + pub use datafusion_functions_aggregate_common::utils::{ + adjust_output_array, get_accum_scalar_values_as_arrays, get_sort_options, + ordering_fields, DecimalAverager, Hashable, + }; +} + use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::ScalarValue; use datafusion_common::{internal_err, not_impl_err, Result}; use datafusion_expr::AggregateUDF; use datafusion_expr::ReversedUDAF; use datafusion_expr_common::accumulator::Accumulator; -use datafusion_expr_common::groups_accumulator::GroupsAccumulator; use datafusion_expr_common::type_coercion::aggregates::check_arg_count; use datafusion_functions_aggregate_common::accumulator::AccumulatorArgs; use datafusion_functions_aggregate_common::accumulator::StateFieldsArgs; -use datafusion_functions_aggregate_common::aggregate::AggregateExpr; use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity; -use datafusion_functions_aggregate_common::utils::{self, down_cast_any_ref}; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use datafusion_physical_expr_common::utils::reverse_order_bys; +use datafusion_expr_common::groups_accumulator::GroupsAccumulator; use std::fmt::Debug; -use std::{any::Any, sync::Arc}; +use std::sync::Arc; -/// Builder for physical [`AggregateExpr`] +/// Builder for physical [`AggregateFunctionExpr`] /// -/// `AggregateExpr` contains the information necessary to call +/// `AggregateFunctionExpr` contains the information necessary to call /// an aggregate expression. 
#[derive(Debug, Clone)] pub struct AggregateExprBuilder { @@ -71,7 +88,7 @@ impl AggregateExprBuilder { } } - pub fn build(self) -> Result> { + pub fn build(self) -> Result> { let Self { fun, args, @@ -204,6 +221,17 @@ impl AggregateFunctionExpr { &self.fun } + /// Expressions that are passed to the Accumulator. + /// Single-column aggregations such as `sum` return a single value, others (e.g. `cov`) return many. + pub fn expressions(&self) -> Vec> { + self.args.clone() + } + + /// Human-readable name such as `"MIN(c2)"`. + pub fn name(&self) -> &str { + &self.name + } + /// Return if the aggregation is distinct pub fn is_distinct(&self) -> bool { self.is_distinct @@ -219,34 +247,13 @@ impl AggregateFunctionExpr { self.is_reversed } + /// Return if the aggregation is nullable pub fn is_nullable(&self) -> bool { self.is_nullable } -} -impl AggregateExpr for AggregateFunctionExpr { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn expressions(&self) -> Vec> { - self.args.clone() - } - - fn state_fields(&self) -> Result> { - let args = StateFieldsArgs { - name: &self.name, - input_types: &self.input_types, - return_type: &self.data_type, - ordering_fields: &self.ordering_fields, - is_distinct: self.is_distinct, - }; - - self.fun.state_fields(args) - } - - fn field(&self) -> Result { + /// The field of the final result of this aggregation. + pub fn field(&self) -> Result { Ok(Field::new( &self.name, self.data_type.clone(), @@ -254,7 +261,10 @@ impl AggregateExpr for AggregateFunctionExpr { )) } - fn create_accumulator(&self) -> Result> { + /// The accumulator used to accumulate values from the expressions. + /// The accumulator expects the same number of arguments as `expressions` and must + /// return states with the same description as `state_fields`. + pub fn create_accumulator(&self) -> Result> { let acc_args = AccumulatorArgs { return_type: &self.data_type, schema: &self.schema, @@ -269,7 +279,83 @@ impl AggregateExpr for AggregateFunctionExpr { self.fun.accumulator(acc_args) } - fn create_sliding_accumulator(&self) -> Result> { + /// The intermediate state fields of the accumulator for this aggregation. + pub fn state_fields(&self) -> Result> { + let args = StateFieldsArgs { + name: &self.name, + input_types: &self.input_types, + return_type: &self.data_type, + ordering_fields: &self.ordering_fields, + is_distinct: self.is_distinct, + }; + + self.fun.state_fields(args) + } + + /// Order by requirements for the aggregate function. + /// By default it is `None` (there is no requirement). + /// Order-sensitive aggregators, such as `FIRST_VALUE(x ORDER BY y)`, should implement this. + pub fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { + if self.ordering_req.is_empty() { + return None; + } + + if !self.order_sensitivity().is_insensitive() { + return Some(&self.ordering_req); + } + + None + } + + /// Indicates whether the aggregator can produce the correct result with any + /// arbitrary input ordering. By default, we assume that aggregate expressions + /// are order insensitive. + pub fn order_sensitivity(&self) -> AggregateOrderSensitivity { + if !self.ordering_req.is_empty() { + // If there is requirement, use the sensitivity of the implementation + self.fun.order_sensitivity() + } else { + // If no requirement, aggregator is order insensitive + AggregateOrderSensitivity::Insensitive + } + } + + /// Sets the indicator of whether the ordering requirements of the aggregator are + /// satisfied by its input.
If this is not the case, aggregators with order + /// sensitivity `AggregateOrderSensitivity::Beneficial` can still produce + /// the correct result with possibly more work internally. + /// + /// # Returns + /// + /// Returns `Ok(Some(updated_expr))` if the process completes successfully. + /// If the expression can benefit from existing input ordering, but does + /// not implement the method, returns an error. Order insensitive and hard + /// requirement aggregators return `Ok(None)`. + pub fn with_beneficial_ordering( + self: Arc, + beneficial_ordering: bool, + ) -> Result>> { + let Some(updated_fn) = self + .fun + .clone() + .with_beneficial_ordering(beneficial_ordering)? + else { + return Ok(None); + }; + + AggregateExprBuilder::new(Arc::new(updated_fn), self.args.to_vec()) + .order_by(self.ordering_req.to_vec()) + .schema(Arc::new(self.schema.clone())) + .alias(self.name().to_string()) + .with_ignore_nulls(self.ignore_nulls) + .with_distinct(self.is_distinct) + .with_reversed(self.is_reversed) + .build() + .map(Some) + } + + /// Creates an accumulator implementation that supports retract. + pub fn create_sliding_accumulator(&self) -> Result> { let args = AccumulatorArgs { return_type: &self.data_type, schema: &self.schema, @@ -335,11 +421,10 @@ impl AggregateExpr for AggregateFunctionExpr { Ok(accumulator) } - fn name(&self) -> &str { - &self.name - } - - fn groups_accumulator_supported(&self) -> bool { + /// Returns true if the aggregate expression has a specialized + /// [`GroupsAccumulator`] implementation. If this returns true, + /// [`Self::create_groups_accumulator`] will be called. + pub fn groups_accumulator_supported(&self) -> bool { let args = AccumulatorArgs { return_type: &self.data_type, schema: &self.schema, @@ -353,7 +438,12 @@ impl AggregateExpr for AggregateFunctionExpr { self.fun.groups_accumulator_supported(args) } - fn create_groups_accumulator(&self) -> Result> { + /// Return a specialized [`GroupsAccumulator`] that manages state + /// for all groups. + /// + /// For maximum performance, a [`GroupsAccumulator`] should be + /// implemented in addition to [`Accumulator`]. + pub fn create_groups_accumulator(&self) -> Result> { let args = AccumulatorArgs { return_type: &self.data_type, schema: &self.schema, @@ -367,52 +457,11 @@ impl AggregateExpr for AggregateFunctionExpr { self.fun.create_groups_accumulator(args) } - fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { - if self.ordering_req.is_empty() { - return None; - } - - if !self.order_sensitivity().is_insensitive() { - return Some(&self.ordering_req); - } - - None - } - - fn order_sensitivity(&self) -> AggregateOrderSensitivity { - if !self.ordering_req.is_empty() { - // If there is requirement, use the sensitivity of the implementation - self.fun.order_sensitivity() - } else { - // If no requirement, aggregator is order insensitive - AggregateOrderSensitivity::Insensitive - } - } - - fn with_beneficial_ordering( - self: Arc, - beneficial_ordering: bool, - ) -> Result>> { - let Some(updated_fn) = self - .fun - .clone() - .with_beneficial_ordering(beneficial_ordering)?
- else { - return Ok(None); - }; - - AggregateExprBuilder::new(Arc::new(updated_fn), self.args.to_vec()) - .order_by(self.ordering_req.to_vec()) - .schema(Arc::new(self.schema.clone())) - .alias(self.name().to_string()) - .with_ignore_nulls(self.ignore_nulls) - .with_distinct(self.is_distinct) - .with_reversed(self.is_reversed) - .build() - .map(Some) - } - - fn reverse_expr(&self) -> Option> { + /// Construct an expression that calculates the aggregate in reverse. + /// Typically the "reverse" expression is itself (e.g. SUM, COUNT). + /// For aggregates that do not support calculation in reverse, + /// returns `None` (which is the default value). + pub fn reverse_expr(&self) -> Option> { match self.fun.reverse_udf() { ReversedUDAF::NotSupported => None, ReversedUDAF::Identical => Some(Arc::new(self.clone())), @@ -440,33 +489,72 @@ impl AggregateExpr for AggregateFunctionExpr { } } - fn get_minmax_desc(&self) -> Option<(Field, bool)> { + /// Returns all expressions used in the [`AggregateFunctionExpr`]. + /// These expressions are (1) function arguments and (2) order by expressions. + pub fn all_expressions(&self) -> AggregatePhysicalExpressions { + let args = self.expressions(); + let order_bys = self.order_bys().unwrap_or(&[]); + let order_by_exprs = order_bys + .iter() + .map(|sort_expr| Arc::clone(&sort_expr.expr)) + .collect::>(); + AggregatePhysicalExpressions { + args, + order_by_exprs, + } + } + + /// Rewrites [`AggregateFunctionExpr`] with the new expressions given. The arguments should be consistent + /// with the return value of the [`AggregateFunctionExpr::all_expressions`] method. + /// Returns `Some(Arc)` if the rewrite is supported, otherwise returns `None`. + pub fn with_new_expressions( + &self, + _args: Vec>, + _order_by_exprs: Vec>, + ) -> Option> { + None + } + + /// If this function is max, return `(output_field, true)`; + /// if the function is min, return `(output_field, false)`; + /// otherwise return `None` (the default). + /// + /// `output_field` is the field of the column produced by this aggregate. + /// + /// Note: this is used to select special aggregate implementations in certain conditions. + pub fn get_minmax_desc(&self) -> Option<(Field, bool)> { self.fun .is_descending() .and_then(|flag| self.field().ok().map(|f| (f, flag))) } - fn default_value(&self, data_type: &DataType) -> Result { + /// Returns the default value of the function when the input is Null. + /// Most aggregate functions return Null if the input is Null, + /// while `count` returns 0. + pub fn default_value(&self, data_type: &DataType) -> Result { self.fun.default_value(data_type) } } -impl PartialEq for AggregateFunctionExpr { - fn eq(&self, other: &dyn Any) -> bool { - down_cast_any_ref(other) - .downcast_ref::() - .map(|x| { - self.name == x.name - && self.data_type == x.data_type - && self.fun == x.fun - && self.args.len() == x.args.len() - && self - .args - .iter() - .zip(x.args.iter()) - .all(|(this_arg, other_arg)| this_arg.eq(other_arg)) - }) - .unwrap_or(false) +/// Stores the physical expressions used inside the `AggregateFunctionExpr`.
+pub struct AggregatePhysicalExpressions { + /// Aggregate function arguments + pub args: Vec>, + /// Order by expressions + pub order_by_exprs: Vec>, +} + +impl PartialEq for AggregateFunctionExpr { + fn eq(&self, other: &Self) -> bool { + self.name == other.name + && self.data_type == other.data_type + && self.fun == other.fun + && self.args.len() == other.args.len() + && self + .args + .iter() + .zip(other.args.iter()) + .all(|(this_arg, other_arg)| this_arg.eq(other_arg)) } } diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index c4255172d680..7db7188b85d3 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -19,27 +19,7 @@ #![deny(clippy::clone_on_ref_ptr)] // Backward compatibility -pub mod aggregate { - pub(crate) mod groups_accumulator { - #[allow(unused_imports)] - pub(crate) mod accumulate { - pub use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::NullState; - } - pub use datafusion_functions_aggregate_common::aggregate::groups_accumulator::{ - accumulate::NullState, GroupsAccumulatorAdapter, - }; - } - pub(crate) mod stats { - pub use datafusion_functions_aggregate_common::stats::StatsType; - } - pub mod utils { - pub use datafusion_functions_aggregate_common::utils::{ - adjust_output_array, down_cast_any_ref, get_accum_scalar_values_as_arrays, - get_sort_options, ordering_fields, DecimalAverager, Hashable, - }; - } - pub use datafusion_functions_aggregate_common::aggregate::AggregateExpr; -} +pub mod aggregate; pub mod analysis; pub mod binary_map { pub use datafusion_physical_expr_common::binary_map::{ArrowBytesSet, OutputType}; @@ -67,9 +47,6 @@ pub mod execution_props { pub use aggregate::groups_accumulator::{GroupsAccumulatorAdapter, NullState}; pub use analysis::{analyze, AnalysisContext, ExprBoundaries}; -pub use datafusion_functions_aggregate_common::aggregate::{ - AggregateExpr, AggregatePhysicalExpressions, -}; pub use equivalence::{calculate_union, ConstExpr, EquivalenceProperties}; pub use partitioning::{Distribution, Partitioning}; pub use physical_expr::{ diff --git a/datafusion/physical-expr/src/window/aggregate.rs b/datafusion/physical-expr/src/window/aggregate.rs index 52015f425217..5439e140502a 100644 --- a/datafusion/physical-expr/src/window/aggregate.rs +++ b/datafusion/physical-expr/src/window/aggregate.rs @@ -29,20 +29,19 @@ use datafusion_common::ScalarValue; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{Accumulator, WindowFrame}; +use crate::aggregate::AggregateFunctionExpr; use crate::window::window_expr::AggregateWindowExpr; use crate::window::{ PartitionBatches, PartitionWindowAggStates, SlidingAggregateWindowExpr, WindowExpr, }; -use crate::{ - expressions::PhysicalSortExpr, reverse_order_bys, AggregateExpr, PhysicalExpr, -}; +use crate::{expressions::PhysicalSortExpr, reverse_order_bys, PhysicalExpr}; /// A window expr that takes the form of an aggregate function. /// /// See comments on [`WindowExpr`] for more details. 
#[derive(Debug)] pub struct PlainAggregateWindowExpr { - aggregate: Arc, + aggregate: Arc, partition_by: Vec>, order_by: Vec, window_frame: Arc, @@ -51,7 +50,7 @@ pub struct PlainAggregateWindowExpr { impl PlainAggregateWindowExpr { /// Create a new aggregate window function expression pub fn new( - aggregate: Arc, + aggregate: Arc, partition_by: &[Arc], order_by: &[PhysicalSortExpr], window_frame: Arc, @@ -65,7 +64,7 @@ impl PlainAggregateWindowExpr { } /// Get aggregate expr of AggregateWindowExpr - pub fn get_aggregate_expr(&self) -> &Arc { + pub fn get_aggregate_expr(&self) -> &Arc { &self.aggregate } } diff --git a/datafusion/physical-expr/src/window/sliding_aggregate.rs b/datafusion/physical-expr/src/window/sliding_aggregate.rs index afa799e86953..ac3a4f4c09ec 100644 --- a/datafusion/physical-expr/src/window/sliding_aggregate.rs +++ b/datafusion/physical-expr/src/window/sliding_aggregate.rs @@ -28,13 +28,12 @@ use arrow::record_batch::RecordBatch; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{Accumulator, WindowFrame}; +use crate::aggregate::AggregateFunctionExpr; use crate::window::window_expr::AggregateWindowExpr; use crate::window::{ PartitionBatches, PartitionWindowAggStates, PlainAggregateWindowExpr, WindowExpr, }; -use crate::{ - expressions::PhysicalSortExpr, reverse_order_bys, AggregateExpr, PhysicalExpr, -}; +use crate::{expressions::PhysicalSortExpr, reverse_order_bys, PhysicalExpr}; /// A window expr that takes the form of an aggregate function that /// can be incrementally computed over sliding windows. @@ -42,7 +41,7 @@ use crate::{ /// See comments on [`WindowExpr`] for more details. #[derive(Debug)] pub struct SlidingAggregateWindowExpr { - aggregate: Arc, + aggregate: Arc, partition_by: Vec>, order_by: Vec, window_frame: Arc, @@ -51,7 +50,7 @@ pub struct SlidingAggregateWindowExpr { impl SlidingAggregateWindowExpr { /// Create a new (sliding) aggregate window function expression. pub fn new( - aggregate: Arc, + aggregate: Arc, partition_by: &[Arc], order_by: &[PhysicalSortExpr], window_frame: Arc, @@ -64,8 +63,8 @@ impl SlidingAggregateWindowExpr { } } - /// Get the [AggregateExpr] of this object. - pub fn get_aggregate_expr(&self) -> &Arc { + /// Get the [AggregateFunctionExpr] of this object. 
+ pub fn get_aggregate_expr(&self) -> &Arc { &self.aggregate } } diff --git a/datafusion/physical-optimizer/src/aggregate_statistics.rs b/datafusion/physical-optimizer/src/aggregate_statistics.rs index 66b250c5063b..2b8725b5bac7 100644 --- a/datafusion/physical-optimizer/src/aggregate_statistics.rs +++ b/datafusion/physical-optimizer/src/aggregate_statistics.rs @@ -23,7 +23,7 @@ use datafusion_common::scalar::ScalarValue; use datafusion_common::Result; use datafusion_physical_plan::aggregates::AggregateExec; use datafusion_physical_plan::projection::ProjectionExec; -use datafusion_physical_plan::{expressions, AggregateExpr, ExecutionPlan, Statistics}; +use datafusion_physical_plan::{expressions, ExecutionPlan, Statistics}; use crate::PhysicalOptimizerRule; use datafusion_common::stats::Precision; @@ -58,12 +58,12 @@ impl PhysicalOptimizerRule for AggregateStatistics { let mut projections = vec![]; for expr in partial_agg_exec.aggr_expr() { if let Some((non_null_rows, name)) = - take_optimizable_column_and_table_count(&**expr, &stats) + take_optimizable_column_and_table_count(expr, &stats) { projections.push((expressions::lit(non_null_rows), name.to_owned())); - } else if let Some((min, name)) = take_optimizable_min(&**expr, &stats) { + } else if let Some((min, name)) = take_optimizable_min(expr, &stats) { projections.push((expressions::lit(min), name.to_owned())); - } else if let Some((max, name)) = take_optimizable_max(&**expr, &stats) { + } else if let Some((max, name)) = take_optimizable_max(expr, &stats) { projections.push((expressions::lit(max), name.to_owned())); } else { // TODO: we need all aggr_expr to be resolved (cf TODO fullres) @@ -137,7 +137,7 @@ fn take_optimizable(node: &dyn ExecutionPlan) -> Option> /// If this agg_expr is a count that can be exactly derived from the statistics, return it. fn take_optimizable_column_and_table_count( - agg_expr: &dyn AggregateExpr, + agg_expr: &AggregateFunctionExpr, stats: &Statistics, ) -> Option<(ScalarValue, String)> { let col_stats = &stats.column_statistics; @@ -174,7 +174,7 @@ fn take_optimizable_column_and_table_count( /// If this agg_expr is a min that is exactly defined in the statistics, return it. fn take_optimizable_min( - agg_expr: &dyn AggregateExpr, + agg_expr: &AggregateFunctionExpr, stats: &Statistics, ) -> Option<(ScalarValue, String)> { if let Precision::Exact(num_rows) = &stats.num_rows { @@ -220,7 +220,7 @@ fn take_optimizable_min( /// If this agg_expr is a max that is exactly defined in the statistics, return it. 
fn take_optimizable_max( - agg_expr: &dyn AggregateExpr, + agg_expr: &AggregateFunctionExpr, stats: &Statistics, ) -> Option<(ScalarValue, String)> { if let Precision::Exact(num_rows) = &stats.num_rows { @@ -266,33 +266,27 @@ fn take_optimizable_max( // TODO: Move this check into AggregateUDFImpl // https://github.com/apache/datafusion/issues/11153 -fn is_non_distinct_count(agg_expr: &dyn AggregateExpr) -> bool { - if let Some(agg_expr) = agg_expr.as_any().downcast_ref::() { - if agg_expr.fun().name() == "count" && !agg_expr.is_distinct() { - return true; - } +fn is_non_distinct_count(agg_expr: &AggregateFunctionExpr) -> bool { + if agg_expr.fun().name() == "count" && !agg_expr.is_distinct() { + return true; } false } // TODO: Move this check into AggregateUDFImpl // https://github.com/apache/datafusion/issues/11153 -fn is_min(agg_expr: &dyn AggregateExpr) -> bool { - if let Some(agg_expr) = agg_expr.as_any().downcast_ref::() { - if agg_expr.fun().name().to_lowercase() == "min" { - return true; - } +fn is_min(agg_expr: &AggregateFunctionExpr) -> bool { + if agg_expr.fun().name().to_lowercase() == "min" { + return true; } false } // TODO: Move this check into AggregateUDFImpl // https://github.com/apache/datafusion/issues/11153 -fn is_max(agg_expr: &dyn AggregateExpr) -> bool { - if let Some(agg_expr) = agg_expr.as_any().downcast_ref::() { - if agg_expr.fun().name().to_lowercase() == "max" { - return true; - } +fn is_max(agg_expr: &AggregateFunctionExpr) -> bool { + if agg_expr.fun().name().to_lowercase() == "max" { + return true; } false } diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 78da4dc9c53f..24387c5f15ee 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -55,7 +55,6 @@ datafusion-functions-aggregate = { workspace = true } datafusion-functions-aggregate-common = { workspace = true } datafusion-physical-expr = { workspace = true, default-features = true } datafusion-physical-expr-common = { workspace = true } -datafusion-physical-expr-functions-aggregate = { workspace = true } futures = { workspace = true } half = { workspace = true } hashbrown = { workspace = true } diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 5aa255e7c341..0aeaa15b5f1d 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -42,10 +42,11 @@ use datafusion_expr::Accumulator; use datafusion_physical_expr::{ equivalence::{collapse_lex_req, ProjectionMapping}, expressions::{Column, UnKnownColumn}, - physical_exprs_contains, AggregateExpr, EquivalenceProperties, LexOrdering, - LexRequirement, PhysicalExpr, PhysicalSortRequirement, + physical_exprs_contains, EquivalenceProperties, LexOrdering, LexRequirement, + PhysicalExpr, PhysicalSortRequirement, }; +use datafusion_physical_expr::aggregate::AggregateFunctionExpr; use itertools::Itertools; pub mod group_values; @@ -253,7 +254,7 @@ pub struct AggregateExec { /// Group by expressions group_by: PhysicalGroupBy, /// Aggregate expressions - aggr_expr: Vec>, + aggr_expr: Vec>, /// FILTER (WHERE clause) expression for each aggregate expression filter_expr: Vec>>, /// Set if the output of this aggregation is truncated by a upstream sort/limit clause @@ -280,7 +281,10 @@ impl AggregateExec { /// Function used in `ConvertFirstLast` optimizer rule, /// where we need parts of the new value, others cloned from the old one /// Rewrites aggregate exec with new
aggregate expressions. - pub fn with_new_aggr_exprs(&self, aggr_expr: Vec>) -> Self { + pub fn with_new_aggr_exprs( + &self, + aggr_expr: Vec>, + ) -> Self { Self { aggr_expr, // clone the rest of the fields @@ -306,7 +310,7 @@ impl AggregateExec { pub fn try_new( mode: AggregateMode, group_by: PhysicalGroupBy, - aggr_expr: Vec>, + aggr_expr: Vec>, filter_expr: Vec>>, input: Arc, input_schema: SchemaRef, @@ -343,7 +347,7 @@ impl AggregateExec { fn try_new_with_schema( mode: AggregateMode, group_by: PhysicalGroupBy, - mut aggr_expr: Vec>, + mut aggr_expr: Vec>, filter_expr: Vec>>, input: Arc, input_schema: SchemaRef, @@ -451,7 +455,7 @@ impl AggregateExec { } /// Aggregate expressions - pub fn aggr_expr(&self) -> &[Arc] { + pub fn aggr_expr(&self) -> &[Arc] { &self.aggr_expr } @@ -788,7 +792,7 @@ impl ExecutionPlan for AggregateExec { fn create_schema( input_schema: &Schema, group_expr: &[(Arc, String)], - aggr_expr: &[Arc], + aggr_expr: &[Arc], contains_null_expr: bool, mode: AggregateMode, ) -> Result { @@ -834,7 +838,7 @@ fn group_schema(schema: &Schema, group_count: usize) -> SchemaRef { /// /// # Parameters /// -/// - `aggr_expr`: A reference to an `Arc` representing the +/// - `aggr_expr`: A reference to an `Arc` representing the /// aggregate expression. /// - `group_by`: A reference to a `PhysicalGroupBy` instance representing the /// physical GROUP BY expression. @@ -846,7 +850,7 @@ fn group_schema(schema: &Schema, group_count: usize) -> SchemaRef { /// A `LexOrdering` instance indicating the lexical ordering requirement for /// the aggregate expression. fn get_aggregate_expr_req( - aggr_expr: &Arc, + aggr_expr: &Arc, group_by: &PhysicalGroupBy, agg_mode: &AggregateMode, ) -> LexOrdering { @@ -894,7 +898,7 @@ fn get_aggregate_expr_req( /// the aggregator requirement is incompatible. fn finer_ordering( existing_req: &LexOrdering, - aggr_expr: &Arc, + aggr_expr: &Arc, group_by: &PhysicalGroupBy, eq_properties: &EquivalenceProperties, agg_mode: &AggregateMode, @@ -912,7 +916,7 @@ pub fn concat_slices(lhs: &[T], rhs: &[T]) -> Vec { /// /// # Parameters /// -/// - `aggr_exprs`: A slice of `Arc` containing all the +/// - `aggr_exprs`: A slice of `Arc` containing all the /// aggregate expressions. /// - `group_by`: A reference to a `PhysicalGroupBy` instance representing the /// physical GROUP BY expression. @@ -926,7 +930,7 @@ pub fn concat_slices(lhs: &[T], rhs: &[T]) -> Vec { /// A `LexRequirement` instance, which is the requirement that satisfies all the /// aggregate requirements. Returns an error in case of conflicting requirements. pub fn get_finer_aggregate_exprs_requirement( - aggr_exprs: &mut [Arc], + aggr_exprs: &mut [Arc], group_by: &PhysicalGroupBy, eq_properties: &EquivalenceProperties, agg_mode: &AggregateMode, @@ -996,10 +1000,10 @@ pub fn get_finer_aggregate_exprs_requirement( /// returns physical expressions for arguments to evaluate against a batch /// The expressions are different depending on `mode`: -/// * Partial: AggregateExpr::expressions -/// * Final: columns of `AggregateExpr::state_fields()` +/// * Partial: AggregateFunctionExpr::expressions +/// * Final: columns of `AggregateFunctionExpr::state_fields()` pub fn aggregate_expressions( - aggr_expr: &[Arc], + aggr_expr: &[Arc], mode: &AggregateMode, col_idx_base: usize, ) -> Result>>> { @@ -1035,12 +1039,12 @@ pub fn aggregate_expressions( } /// uses `state_fields` to build a vec of physical column expressions required to merge the -/// AggregateExpr' accumulator's state. 
+/// AggregateFunctionExpr's accumulator's state. /// /// `index_base` is the starting physical column index for the next expanded state field. fn merge_expressions( index_base: usize, - expr: &Arc, + expr: &Arc, ) -> Result>> { expr.state_fields().map(|fields| { fields @@ -1054,7 +1058,7 @@ pub type AccumulatorItem = Box; pub fn create_accumulators( - aggr_expr: &[Arc], + aggr_expr: &[Arc], ) -> Result> { aggr_expr .iter() @@ -1218,8 +1222,8 @@ mod tests { use datafusion_physical_expr::PhysicalSortExpr; use crate::common::collect; + use datafusion_physical_expr::aggregate::AggregateExprBuilder; use datafusion_physical_expr::expressions::Literal; - use datafusion_physical_expr_functions_aggregate::aggregate::AggregateExprBuilder; use futures::{FutureExt, Stream}; // Generate a schema which consists of 5 columns (a, b, c, d, e) @@ -1496,13 +1500,12 @@ mod tests { groups: vec![vec![false]], }; - let aggregates: Vec> = - vec![ - AggregateExprBuilder::new(avg_udaf(), vec![col("b", &input_schema)?]) - .schema(Arc::clone(&input_schema)) - .alias("AVG(b)") - .build()?, - ]; + let aggregates: Vec> = vec![ + AggregateExprBuilder::new(avg_udaf(), vec![col("b", &input_schema)?]) + .schema(Arc::clone(&input_schema)) + .alias("AVG(b)") + .build()?, + ]; let task_ctx = if spill { // set to an appropriate value to trigger spill @@ -1793,7 +1796,7 @@ mod tests { } // Median(a) - fn test_median_agg_expr(schema: SchemaRef) -> Result> { + fn test_median_agg_expr(schema: SchemaRef) -> Result> { AggregateExprBuilder::new(median_udaf(), vec![col("a", &schema)?]) .schema(schema) .alias("MEDIAN(a)") @@ -1819,17 +1822,16 @@ mod tests { }; // something that allocates within the aggregator - let aggregates_v0: Vec> = + let aggregates_v0: Vec> = vec![test_median_agg_expr(Arc::clone(&input_schema))?]; // use fast-path in `row_hash.rs`.
- let aggregates_v2: Vec> = - vec![ - AggregateExprBuilder::new(avg_udaf(), vec![col("b", &input_schema)?]) - .schema(Arc::clone(&input_schema)) - .alias("AVG(b)") - .build()?, - ]; + let aggregates_v2: Vec> = vec![ + AggregateExprBuilder::new(avg_udaf(), vec![col("b", &input_schema)?]) + .schema(Arc::clone(&input_schema)) + .alias("AVG(b)") + .build()?, + ]; for (version, groups, aggregates) in [ (0, groups_none, aggregates_v0), @@ -1883,13 +1885,12 @@ mod tests { let groups = PhysicalGroupBy::default(); - let aggregates: Vec> = - vec![ - AggregateExprBuilder::new(avg_udaf(), vec![col("a", &schema)?]) - .schema(Arc::clone(&schema)) - .alias("AVG(a)") - .build()?, - ]; + let aggregates: Vec> = vec![ + AggregateExprBuilder::new(avg_udaf(), vec![col("a", &schema)?]) + .schema(Arc::clone(&schema)) + .alias("AVG(a)") + .build()?, + ]; let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); let refs = blocking_exec.refs(); @@ -1923,13 +1924,12 @@ mod tests { let groups = PhysicalGroupBy::new_single(vec![(col("a", &schema)?, "a".to_string())]); - let aggregates: Vec> = - vec![ - AggregateExprBuilder::new(avg_udaf(), vec![col("b", &schema)?]) - .schema(Arc::clone(&schema)) - .alias("AVG(b)") - .build()?, - ]; + let aggregates: Vec> = vec![ + AggregateExprBuilder::new(avg_udaf(), vec![col("b", &schema)?]) + .schema(Arc::clone(&schema)) + .alias("AVG(b)") + .build()?, + ]; let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); let refs = blocking_exec.refs(); @@ -1974,7 +1974,7 @@ mod tests { fn test_first_value_agg_expr( schema: &Schema, sort_options: SortOptions, - ) -> Result> { + ) -> Result> { let ordering_req = [PhysicalSortExpr { expr: col("b", schema)?, options: sort_options, @@ -1992,7 +1992,7 @@ mod tests { fn test_last_value_agg_expr( schema: &Schema, sort_options: SortOptions, - ) -> Result> { + ) -> Result> { let ordering_req = [PhysicalSortExpr { expr: col("b", schema)?, options: sort_options, @@ -2047,7 +2047,7 @@ mod tests { descending: false, nulls_first: false, }; - let aggregates: Vec> = if is_first_acc { + let aggregates: Vec> = if is_first_acc { vec![test_first_value_agg_expr(&schema, sort_options)?] } else { vec![test_last_value_agg_expr(&schema, sort_options)?] 
@@ -2212,7 +2212,7 @@ mod tests { }; let groups = PhysicalGroupBy::new_single(vec![(col_a, "a".to_string())]); - let aggregates: Vec> = vec![ + let aggregates: Vec> = vec![ test_first_value_agg_expr(&schema, option_desc)?, test_last_value_agg_expr(&schema, option_desc)?, ]; @@ -2270,7 +2270,7 @@ mod tests { ], ); - let aggregates: Vec> = + let aggregates: Vec> = vec![AggregateExprBuilder::new(count_udaf(), vec![lit(1)]) .schema(Arc::clone(&schema)) .alias("1") diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 05f4ec621813..d022bb007d9b 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -47,10 +47,9 @@ use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; use datafusion_expr::{EmitTo, GroupsAccumulator}; use datafusion_physical_expr::expressions::Column; -use datafusion_physical_expr::{ - AggregateExpr, GroupsAccumulatorAdapter, PhysicalSortExpr, -}; +use datafusion_physical_expr::{GroupsAccumulatorAdapter, PhysicalSortExpr}; +use datafusion_physical_expr::aggregate::AggregateFunctionExpr; use futures::ready; use futures::stream::{Stream, StreamExt}; use log::debug; @@ -396,7 +395,7 @@ pub(crate) struct GroupedHashAggregateStream { /// processed. Reused across batches here to avoid reallocations current_group_indices: Vec, - /// Accumulators, one for each `AggregateExpr` in the query + /// Accumulators, one for each `AggregateFunctionExpr` in the query /// /// For example, if the query has aggregates, `SUM(x)`, /// `COUNT(y)`, there will be two accumulators, each one @@ -579,7 +578,7 @@ impl GroupedHashAggregateStream { /// that is supported by the aggregate, or a /// [`GroupsAccumulatorAdapter`] if not. 
pub(crate) fn create_group_accumulator( - agg_expr: &Arc, + agg_expr: &Arc, ) -> Result> { if agg_expr.groups_accumulator_supported() { agg_expr.create_groups_accumulator() diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index e1182719293d..c1c66f6d3923 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -34,7 +34,7 @@ pub use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream}; pub use datafusion_expr::{Accumulator, ColumnarValue}; pub use datafusion_physical_expr::window::WindowExpr; pub use datafusion_physical_expr::{ - expressions, functions, udf, AggregateExpr, Distribution, Partitioning, PhysicalExpr, + expressions, functions, udf, Distribution, Partitioning, PhysicalExpr, }; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexRequirement; diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index fb86a008e2cd..026798c5798b 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -31,7 +31,7 @@ pub use datafusion_expr::{Accumulator, ColumnarValue}; pub use datafusion_physical_expr::window::WindowExpr; use datafusion_physical_expr::PhysicalSortExpr; pub use datafusion_physical_expr::{ - expressions, functions, udf, AggregateExpr, Distribution, Partitioning, PhysicalExpr, + expressions, functions, udf, Distribution, Partitioning, PhysicalExpr, }; pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay}; @@ -82,7 +82,7 @@ pub mod windows; pub mod work_table; pub mod udaf { - pub use datafusion_physical_expr_functions_aggregate::aggregate::AggregateFunctionExpr; + pub use datafusion_physical_expr::aggregate::AggregateFunctionExpr; } pub mod coalesce; diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index d607bb79b44e..56823e6dec2d 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -37,14 +37,13 @@ use datafusion_expr::{ BuiltInWindowFunction, PartitionEvaluator, WindowFrame, WindowFunctionDefinition, WindowUDF, }; +use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; use datafusion_physical_expr::equivalence::collapse_lex_req; use datafusion_physical_expr::{ reverse_order_bys, window::{BuiltInWindowFunctionExpr, SlidingAggregateWindowExpr}, - AggregateExpr, ConstExpr, EquivalenceProperties, LexOrdering, - PhysicalSortRequirement, + ConstExpr, EquivalenceProperties, LexOrdering, PhysicalSortRequirement, }; -use datafusion_physical_expr_functions_aggregate::aggregate::AggregateExprBuilder; use itertools::Itertools; mod bounded_window_agg_exec; @@ -142,7 +141,7 @@ fn window_expr_from_aggregate_expr( partition_by: &[Arc], order_by: &[PhysicalSortExpr], window_frame: Arc, - aggregate: Arc, + aggregate: Arc, ) -> Arc { // Is there a potentially unlimited sized window frame? 
let unbounded_window = window_frame.start_bound.is_unbounded(); diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 96fb45eafe62..78f370c714cc 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -18,7 +18,7 @@ use std::fmt::Debug; use std::sync::Arc; -use datafusion::physical_expr_functions_aggregate::aggregate::AggregateExprBuilder; +use datafusion::physical_expr::aggregate::AggregateExprBuilder; use prost::bytes::BufMut; use prost::Message; @@ -34,6 +34,7 @@ use datafusion::datasource::physical_plan::ParquetExec; use datafusion::datasource::physical_plan::{AvroExec, CsvExec}; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::execution::FunctionRegistry; +use datafusion::physical_expr::aggregate::AggregateFunctionExpr; use datafusion::physical_expr::{PhysicalExprRef, PhysicalSortRequirement}; use datafusion::physical_plan::aggregates::AggregateMode; use datafusion::physical_plan::aggregates::{AggregateExec, PhysicalGroupBy}; @@ -59,7 +60,7 @@ use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMerge use datafusion::physical_plan::union::{InterleaveExec, UnionExec}; use datafusion::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use datafusion::physical_plan::{ - AggregateExpr, ExecutionPlan, InputOrderMode, PhysicalExpr, WindowExpr, + ExecutionPlan, InputOrderMode, PhysicalExpr, WindowExpr, }; use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result}; use datafusion_expr::{AggregateUDF, ScalarUDF}; @@ -467,7 +468,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { }) .collect::, _>>()?; - let physical_aggr_expr: Vec> = hash_agg + let physical_aggr_expr: Vec> = hash_agg .aggr_expr .iter() .zip(hash_agg.aggr_expr_name.iter()) diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 7949a457f40f..555ad22a9bc1 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -29,7 +29,7 @@ use datafusion::physical_plan::expressions::{ }; use datafusion::physical_plan::udaf::AggregateFunctionExpr; use datafusion::physical_plan::windows::{BuiltInWindowExpr, PlainAggregateWindowExpr}; -use datafusion::physical_plan::{AggregateExpr, Partitioning, PhysicalExpr, WindowExpr}; +use datafusion::physical_plan::{Partitioning, PhysicalExpr, WindowExpr}; use datafusion::{ datasource::{ file_format::{csv::CsvSink, json::JsonSink}, @@ -49,58 +49,50 @@ use crate::protobuf::{ use super::PhysicalExtensionCodec; pub fn serialize_physical_aggr_expr( - aggr_expr: Arc, + aggr_expr: Arc, codec: &dyn PhysicalExtensionCodec, ) -> Result { let expressions = serialize_physical_exprs(aggr_expr.expressions(), codec)?; let ordering_req = aggr_expr.order_bys().unwrap_or(&[]).to_vec(); let ordering_req = serialize_physical_sort_exprs(ordering_req, codec)?; - if let Some(a) = aggr_expr.as_any().downcast_ref::() { - let name = a.fun().name().to_string(); - let mut buf = Vec::new(); - codec.try_encode_udaf(a.fun(), &mut buf)?; - Ok(protobuf::PhysicalExprNode { - expr_type: Some(protobuf::physical_expr_node::ExprType::AggregateExpr( - protobuf::PhysicalAggregateExprNode { - aggregate_function: Some(physical_aggregate_expr_node::AggregateFunction::UserDefinedAggrFunction(name)), - expr: expressions, - ordering_req, - distinct: a.is_distinct(), - ignore_nulls: a.ignore_nulls(), - fun_definition: (!buf.is_empty()).then_some(buf) - }, - )), 
- }) - } else { - unreachable!("No other types exists besides AggergationFunctionExpr"); - } + let name = aggr_expr.fun().name().to_string(); + let mut buf = Vec::new(); + codec.try_encode_udaf(aggr_expr.fun(), &mut buf)?; + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::AggregateExpr( + protobuf::PhysicalAggregateExprNode { + aggregate_function: Some(physical_aggregate_expr_node::AggregateFunction::UserDefinedAggrFunction(name)), + expr: expressions, + ordering_req, + distinct: aggr_expr.is_distinct(), + ignore_nulls: aggr_expr.ignore_nulls(), + fun_definition: (!buf.is_empty()).then_some(buf) + }, + )), + }) } fn serialize_physical_window_aggr_expr( - aggr_expr: &dyn AggregateExpr, + aggr_expr: &AggregateFunctionExpr, _window_frame: &WindowFrame, codec: &dyn PhysicalExtensionCodec, ) -> Result<(physical_window_expr_node::WindowFunction, Option>)> { - if let Some(a) = aggr_expr.as_any().downcast_ref::() { - if a.is_distinct() || a.ignore_nulls() { - // TODO - return not_impl_err!( - "Distinct aggregate functions not supported in window expressions" - ); - } - - let mut buf = Vec::new(); - codec.try_encode_udaf(a.fun(), &mut buf)?; - Ok(( - physical_window_expr_node::WindowFunction::UserDefinedAggrFunction( - a.fun().name().to_string(), - ), - (!buf.is_empty()).then_some(buf), - )) - } else { - unreachable!("No other types exists besides AggergationFunctionExpr"); + if aggr_expr.is_distinct() || aggr_expr.ignore_nulls() { + // TODO + return not_impl_err!( + "Distinct aggregate functions not supported in window expressions" + ); } + + let mut buf = Vec::new(); + codec.try_encode_udaf(aggr_expr.fun(), &mut buf)?; + Ok(( + physical_window_expr_node::WindowFunction::UserDefinedAggrFunction( + aggr_expr.fun().name().to_string(), + ), + (!buf.is_empty()).then_some(buf), + )) } pub fn serialize_physical_window_expr( diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index 0ffc494321fb..60f5565bdeee 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -24,7 +24,7 @@ use std::vec; use arrow::array::RecordBatch; use arrow::csv::WriterBuilder; -use datafusion::physical_expr_functions_aggregate::aggregate::AggregateExprBuilder; +use datafusion::physical_expr::aggregate::AggregateExprBuilder; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion_functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf; use datafusion_functions_aggregate::array_agg::array_agg_udaf; @@ -47,7 +47,6 @@ use datafusion::datasource::physical_plan::{ use datafusion::execution::FunctionRegistry; use datafusion::functions_aggregate::sum::sum_udaf; use datafusion::logical_expr::{create_udf, JoinType, Operator, Volatility}; -use datafusion::physical_expr::aggregate::utils::down_cast_any_ref; use datafusion::physical_expr::expressions::Literal; use datafusion::physical_expr::window::SlidingAggregateWindowExpr; use datafusion::physical_expr::{PhysicalSortRequirement, ScalarFunctionExpr}; @@ -70,13 +69,12 @@ use datafusion::physical_plan::placeholder_row::PlaceholderRowExec; use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::repartition::RepartitionExec; use datafusion::physical_plan::sorts::sort::SortExec; +use datafusion::physical_plan::udaf::AggregateFunctionExpr; use datafusion::physical_plan::union::{InterleaveExec, UnionExec}; use 
datafusion::physical_plan::windows::{ BuiltInWindowExpr, PlainAggregateWindowExpr, WindowAggExec, }; -use datafusion::physical_plan::{ - AggregateExpr, ExecutionPlan, Partitioning, PhysicalExpr, Statistics, -}; +use datafusion::physical_plan::{ExecutionPlan, Partitioning, PhysicalExpr, Statistics}; use datafusion::prelude::SessionContext; use datafusion::scalar::ScalarValue; use datafusion_common::config::TableParquetOptions; @@ -362,7 +360,7 @@ fn rountrip_aggregate() -> Result<()> { .alias("NTH_VALUE(b, 1)") .build()?; - let test_cases: Vec>> = vec![ + let test_cases: Vec>> = vec![ // AVG vec![avg_expr], // NTH_VALUE @@ -395,7 +393,7 @@ fn rountrip_aggregate_with_limit() -> Result<()> { let groups: Vec<(Arc, String)> = vec![(col("a", &schema)?, "unused".to_string())]; - let aggregates: Vec> = + let aggregates: Vec> = vec![ AggregateExprBuilder::new(avg_udaf(), vec![col("b", &schema)?]) .schema(Arc::clone(&schema)) @@ -424,7 +422,7 @@ fn rountrip_aggregate_with_approx_pencentile_cont() -> Result<()> { let groups: Vec<(Arc, String)> = vec![(col("a", &schema)?, "unused".to_string())]; - let aggregates: Vec> = vec![AggregateExprBuilder::new( + let aggregates: Vec> = vec![AggregateExprBuilder::new( approx_percentile_cont_udaf(), vec![col("b", &schema)?, lit(0.5)], ) @@ -459,7 +457,7 @@ fn rountrip_aggregate_with_sort() -> Result<()> { }, }]; - let aggregates: Vec> = + let aggregates: Vec> = vec![ AggregateExprBuilder::new(array_agg_udaf(), vec![col("b", &schema)?]) .schema(Arc::clone(&schema)) @@ -526,7 +524,7 @@ fn roundtrip_aggregate_udaf() -> Result<()> { let groups: Vec<(Arc, String)> = vec![(col("a", &schema)?, "unused".to_string())]; - let aggregates: Vec> = + let aggregates: Vec> = vec![ AggregateExprBuilder::new(Arc::new(udaf), vec![col("b", &schema)?]) .schema(Arc::clone(&schema)) @@ -748,7 +746,7 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> { } impl PartialEq for CustomPredicateExpr { fn eq(&self, other: &dyn Any) -> bool { - down_cast_any_ref(other) + other .downcast_ref::() .map(|x| self.inner.eq(&x.inner)) .unwrap_or(false) From 58ed702c8f7f2536a8f71cf8b7486812a3afa42d Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 27 Aug 2024 15:17:23 +0200 Subject: [PATCH 15/23] Deduplicate sort unparsing logic (#12175) * Deduplicate sort unparsing logic Reconstruction of `ast::OrderByExpr` was implemented twice, in `plan.rs` and `expr.rs` submodules of the unparser. * empty --- datafusion/sql/src/unparser/plan.rs | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index 8b5a5b0942b8..106705c322fc 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -528,24 +528,10 @@ impl Unparser<'_> { fn sort_to_sql(&self, sort_exprs: Vec) -> Result> { sort_exprs .iter() - .map(|expr: &Expr| match expr { - Expr::Sort(sort_expr) => { - let col = self.expr_to_sql(&sort_expr.expr)?; - - let nulls_first = if self.dialect.supports_nulls_first_in_sort() { - Some(sort_expr.nulls_first) - } else { - None - }; - - Ok(ast::OrderByExpr { - asc: Some(sort_expr.asc), - expr: col, - nulls_first, - with_fill: None, - }) - } - _ => plan_err!("Expecting Sort expr"), + .map(|expr: &Expr| { + self.expr_to_unparsed(expr)? 
+ .into_order_by_expr() + .or(plan_err!("Expecting Sort expr")) }) .collect::>>() } From 4a943560e1a24108aeb915be260db7c4264c9b11 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 27 Aug 2024 15:22:44 +0200 Subject: [PATCH 16/23] Require sort expressions to be of type Sort (#12171) * Require sort expressions to be of type Sort in compare_sort_expr * fixup! Require sort expressions to be of type Sort in compare_sort_expr * fixup! Require sort expressions to be of type Sort in compare_sort_expr --- datafusion/expr/src/utils.rs | 2 +- datafusion/sql/src/expr/function.rs | 4 +++- datafusion/sql/src/unparser/rewrite.rs | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 8f04d3ec3067..a01d5ef8973a 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -569,7 +569,7 @@ pub fn compare_sort_expr( } Ordering::Equal } - _ => Ordering::Equal, + _ => panic!("Sort expressions must be of type Sort"), } } diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index b95414a8cafd..72e08e4b8fb5 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -295,8 +295,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Some(false) }; } + Some(false) + } else { + panic!("order_by expression must be of type Sort"); } - Some(false) }); let window_frame = window diff --git a/datafusion/sql/src/unparser/rewrite.rs b/datafusion/sql/src/unparser/rewrite.rs index fed3f2c39f9d..9e1adcf4df31 100644 --- a/datafusion/sql/src/unparser/rewrite.rs +++ b/datafusion/sql/src/unparser/rewrite.rs @@ -161,6 +161,8 @@ pub(super) fn rewrite_plan_for_sort_on_non_projected_fields( for expr in &sort.expr { if let Expr::Sort(s) = expr { collects.push(s.expr.as_ref().clone()); + } else { + panic!("sort expression must be of type Sort"); } } From a616e8895bdde48153d70ba32281d5174d172cea Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 27 Aug 2024 15:24:11 +0200 Subject: [PATCH 17/23] Remove `parse_vec_expr` helper (#12178) * Remove `parse_vec_expr` helper The behavior of `parse_vec_expr` and `parse_exprs` is almost identical -- both take a collection of expressions to parse. The only difference is that `parse_vec_expr` returns `Option::None` when the collection is empty, but this difference in behavior does not correspond to the difference in function names. Since the function is used only once, remove it instead of coming up with a fancy name. * empty --- datafusion/proto/src/logical_plan/from_proto.rs | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index b74237b5281b..acda1298dd80 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -586,7 +586,10 @@ pub fn parse_expr( parse_exprs(&pb.args, registry, codec)?, pb.distinct, parse_optional_expr(pb.filter.as_deref(), registry, codec)?.map(Box::new), - parse_vec_expr(&pb.order_by, registry, codec)?, + match pb.order_by.len() { + 0 => None, + _ => Some(parse_exprs(&pb.order_by, registry, codec)?), + }, None, ))) } @@ -676,16 +679,6 @@ pub fn from_proto_binary_op(op: &str) -> Result { } } -fn parse_vec_expr( - p: &[protobuf::LogicalExprNode], - registry: &dyn FunctionRegistry, - codec: &dyn LogicalExtensionCodec, -) -> Result>, Error> { - let res = parse_exprs(p, registry, codec)?; - // Convert empty vector to None.
- Ok((!res.is_empty()).then_some(res)) -} - fn parse_optional_expr( p: Option<&protobuf::LogicalExprNode>, registry: &dyn FunctionRegistry, From bf46f33f0746598c69ad6651c11f36842c5e71a2 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 27 Aug 2024 15:25:23 +0200 Subject: [PATCH 18/23] Reuse bulk serialization helpers for protobuf (#12179) Reuse `parse_exprs` and `serialize_exprs`. Reduce code duplication, increasing readability. --- datafusion/proto/src/logical_plan/mod.rs | 205 +++++++---------------- 1 file changed, 60 insertions(+), 145 deletions(-) diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index bc019725f36c..67977b1795a6 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -29,7 +29,7 @@ use crate::{ }, }; -use crate::protobuf::{proto_error, FromProtoError, ToProtoError}; +use crate::protobuf::{proto_error, ToProtoError}; use arrow::datatypes::{DataType, Schema, SchemaRef}; #[cfg(feature = "parquet")] use datafusion::datasource::file_format::parquet::ParquetFormat; @@ -66,11 +66,10 @@ use datafusion_expr::{ }; use datafusion_expr::{AggregateUDF, Unnest}; +use self::to_proto::{serialize_expr, serialize_exprs}; use prost::bytes::BufMut; use prost::Message; -use self::to_proto::serialize_expr; - pub mod file_formats; pub mod from_proto; pub mod to_proto; @@ -273,13 +272,7 @@ impl AsLogicalPlan for LogicalPlanNode { values .values_list .chunks_exact(n_cols) - .map(|r| { - r.iter() - .map(|expr| { - from_proto::parse_expr(expr, ctx, extension_codec) - }) - .collect::, FromProtoError>>() - }) + .map(|r| from_proto::parse_exprs(r, ctx, extension_codec)) .collect::, _>>() .map_err(|e| e.into()) }?; @@ -288,11 +281,8 @@ impl AsLogicalPlan for LogicalPlanNode { LogicalPlanType::Projection(projection) => { let input: LogicalPlan = into_logical_plan!(projection.input, ctx, extension_codec)?; - let expr: Vec = projection - .expr - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; + let expr: Vec = + from_proto::parse_exprs(&projection.expr, ctx, extension_codec)?; let new_proj = project(input, expr)?; match projection.optional_alias.as_ref() { @@ -324,26 +314,17 @@ impl AsLogicalPlan for LogicalPlanNode { LogicalPlanType::Window(window) => { let input: LogicalPlan = into_logical_plan!(window.input, ctx, extension_codec)?; - let window_expr = window - .window_expr - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; + let window_expr = + from_proto::parse_exprs(&window.window_expr, ctx, extension_codec)?; LogicalPlanBuilder::from(input).window(window_expr)?.build() } LogicalPlanType::Aggregate(aggregate) => { let input: LogicalPlan = into_logical_plan!(aggregate.input, ctx, extension_codec)?; - let group_expr = aggregate - .group_expr - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; - let aggr_expr = aggregate - .aggr_expr - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; + let group_expr = + from_proto::parse_exprs(&aggregate.group_expr, ctx, extension_codec)?; + let aggr_expr = + from_proto::parse_exprs(&aggregate.aggr_expr, ctx, extension_codec)?; LogicalPlanBuilder::from(input) .aggregate(group_expr, aggr_expr)? 
.build() @@ -361,20 +342,16 @@ impl AsLogicalPlan for LogicalPlanNode { projection = Some(column_indices); } - let filters = scan - .filters - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; + let filters = + from_proto::parse_exprs(&scan.filters, ctx, extension_codec)?; let mut all_sort_orders = vec![]; for order in &scan.file_sort_order { - let file_sort_order = order - .logical_expr_nodes - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; - all_sort_orders.push(file_sort_order) + all_sort_orders.push(from_proto::parse_exprs( + &order.logical_expr_nodes, + ctx, + extension_codec, + )?) } let file_format: Arc = @@ -475,11 +452,8 @@ impl AsLogicalPlan for LogicalPlanNode { projection = Some(column_indices); } - let filters = scan - .filters - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; + let filters = + from_proto::parse_exprs(&scan.filters, ctx, extension_codec)?; let table_name = from_table_reference(scan.table_name.as_ref(), "CustomScan")?; @@ -502,11 +476,8 @@ impl AsLogicalPlan for LogicalPlanNode { LogicalPlanType::Sort(sort) => { let input: LogicalPlan = into_logical_plan!(sort.input, ctx, extension_codec)?; - let sort_expr: Vec = sort - .expr - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; + let sort_expr: Vec = + from_proto::parse_exprs(&sort.expr, ctx, extension_codec)?; LogicalPlanBuilder::from(input).sort(sort_expr)?.build() } LogicalPlanType::Repartition(repartition) => { @@ -525,12 +496,7 @@ impl AsLogicalPlan for LogicalPlanNode { hash_expr: pb_hash_expr, partition_count, }) => Partitioning::Hash( - pb_hash_expr - .iter() - .map(|expr| { - from_proto::parse_expr(expr, ctx, extension_codec) - }) - .collect::, _>>()?, + from_proto::parse_exprs(pb_hash_expr, ctx, extension_codec)?, *partition_count as usize, ), PartitionMethod::RoundRobin(partition_count) => { @@ -570,12 +536,11 @@ impl AsLogicalPlan for LogicalPlanNode { let mut order_exprs = vec![]; for expr in &create_extern_table.order_exprs { - let order_expr = expr - .logical_expr_nodes - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; - order_exprs.push(order_expr) + order_exprs.push(from_proto::parse_exprs( + &expr.logical_expr_nodes, + ctx, + extension_codec, + )?); } let mut column_defaults = @@ -693,16 +658,10 @@ impl AsLogicalPlan for LogicalPlanNode { LogicalPlanBuilder::from(input).limit(skip, fetch)?.build() } LogicalPlanType::Join(join) => { - let left_keys: Vec = join - .left_join_key - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; - let right_keys: Vec = join - .right_join_key - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; + let left_keys: Vec = + from_proto::parse_exprs(&join.left_join_key, ctx, extension_codec)?; + let right_keys: Vec = + from_proto::parse_exprs(&join.right_join_key, ctx, extension_codec)?; let join_type = protobuf::JoinType::try_from(join.join_type).map_err(|_| { proto_error(format!( @@ -804,27 +763,20 @@ impl AsLogicalPlan for LogicalPlanNode { LogicalPlanType::DistinctOn(distinct_on) => { let input: LogicalPlan = into_logical_plan!(distinct_on.input, ctx, extension_codec)?; - let on_expr = distinct_on - .on_expr - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; - let select_expr = distinct_on - 
.select_expr - .iter() - .map(|expr| from_proto::parse_expr(expr, ctx, extension_codec)) - .collect::, _>>()?; + let on_expr = + from_proto::parse_exprs(&distinct_on.on_expr, ctx, extension_codec)?; + let select_expr = from_proto::parse_exprs( + &distinct_on.select_expr, + ctx, + extension_codec, + )?; let sort_expr = match distinct_on.sort_expr.len() { 0 => None, - _ => Some( - distinct_on - .sort_expr - .iter() - .map(|expr| { - from_proto::parse_expr(expr, ctx, extension_codec) - }) - .collect::, _>>()?, - ), + _ => Some(from_proto::parse_exprs( + &distinct_on.sort_expr, + ctx, + extension_codec, + )?), }; LogicalPlanBuilder::from(input) .distinct_on(on_expr, select_expr, sort_expr)? @@ -943,11 +895,8 @@ impl AsLogicalPlan for LogicalPlanNode { } else { values[0].len() } as u64; - let values_list = values - .iter() - .flatten() - .map(|v| serialize_expr(v, extension_codec)) - .collect::, _>>()?; + let values_list = + serialize_exprs(values.iter().flatten(), extension_codec)?; Ok(protobuf::LogicalPlanNode { logical_plan_type: Some(LogicalPlanType::Values( protobuf::ValuesNode { @@ -982,10 +931,8 @@ impl AsLogicalPlan for LogicalPlanNode { }; let schema: protobuf::Schema = schema.as_ref().try_into()?; - let filters: Vec = filters - .iter() - .map(|filter| serialize_expr(filter, extension_codec)) - .collect::, _>>()?; + let filters: Vec = + serialize_exprs(filters, extension_codec)?; if let Some(listing_table) = source.downcast_ref::() { let any = listing_table.options().format.as_any(); @@ -1037,10 +984,7 @@ impl AsLogicalPlan for LogicalPlanNode { let mut exprs_vec: Vec = vec![]; for order in &options.file_sort_order { let expr_vec = LogicalExprNodeCollection { - logical_expr_nodes: order - .iter() - .map(|expr| serialize_expr(expr, extension_codec)) - .collect::, ToProtoError>>()?, + logical_expr_nodes: serialize_exprs(order, extension_codec)?, }; exprs_vec.push(expr_vec); } @@ -1118,10 +1062,7 @@ impl AsLogicalPlan for LogicalPlanNode { extension_codec, )?, )), - expr: expr - .iter() - .map(|expr| serialize_expr(expr, extension_codec)) - .collect::, ToProtoError>>()?, + expr: serialize_exprs(expr, extension_codec)?, optional_alias: None, }, ))), @@ -1173,22 +1114,13 @@ impl AsLogicalPlan for LogicalPlanNode { )?; let sort_expr = match sort_expr { None => vec![], - Some(sort_expr) => sort_expr - .iter() - .map(|expr| serialize_expr(expr, extension_codec)) - .collect::, _>>()?, + Some(sort_expr) => serialize_exprs(sort_expr, extension_codec)?, }; Ok(protobuf::LogicalPlanNode { logical_plan_type: Some(LogicalPlanType::DistinctOn(Box::new( protobuf::DistinctOnNode { - on_expr: on_expr - .iter() - .map(|expr| serialize_expr(expr, extension_codec)) - .collect::, _>>()?, - select_expr: select_expr - .iter() - .map(|expr| serialize_expr(expr, extension_codec)) - .collect::, _>>()?, + on_expr: serialize_exprs(on_expr, extension_codec)?, + select_expr: serialize_exprs(select_expr, extension_codec)?, sort_expr, input: Some(Box::new(input)), }, @@ -1207,10 +1139,7 @@ impl AsLogicalPlan for LogicalPlanNode { logical_plan_type: Some(LogicalPlanType::Window(Box::new( protobuf::WindowNode { input: Some(Box::new(input)), - window_expr: window_expr - .iter() - .map(|expr| serialize_expr(expr, extension_codec)) - .collect::, _>>()?, + window_expr: serialize_exprs(window_expr, extension_codec)?, }, ))), }) @@ -1230,14 +1159,8 @@ impl AsLogicalPlan for LogicalPlanNode { logical_plan_type: Some(LogicalPlanType::Aggregate(Box::new( protobuf::AggregateNode { input: Some(Box::new(input)), - group_expr: 
- .iter()
- .map(|expr| serialize_expr(expr, extension_codec))
- .collect::<Result<Vec<_>, _>>()?,
- aggr_expr: aggr_expr
- .iter()
- .map(|expr| serialize_expr(expr, extension_codec))
- .collect::<Result<Vec<_>, _>>()?,
+ group_expr: serialize_exprs(group_expr, extension_codec)?,
+ aggr_expr: serialize_exprs(aggr_expr, extension_codec)?,
},
))),
})
@@ -1335,10 +1258,8 @@ impl AsLogicalPlan for LogicalPlanNode {
input.as_ref(),
extension_codec,
)?;
- let selection_expr: Vec<protobuf::LogicalExprNode> = expr
- .iter()
- .map(|expr| serialize_expr(expr, extension_codec))
- .collect::<Result<Vec<_>, ToProtoError>>()?;
+ let selection_expr: Vec<protobuf::LogicalExprNode> =
+ serialize_exprs(expr, extension_codec)?;
Ok(protobuf::LogicalPlanNode {
logical_plan_type: Some(LogicalPlanType::Sort(Box::new(
protobuf::SortNode {
@@ -1367,10 +1288,7 @@ impl AsLogicalPlan for LogicalPlanNode {
let pb_partition_method = match partitioning_scheme {
Partitioning::Hash(exprs, partition_count) => {
PartitionMethod::Hash(protobuf::HashRepartition {
- hash_expr: exprs
- .iter()
- .map(|expr| serialize_expr(expr, extension_codec))
- .collect::<Result<Vec<_>, ToProtoError>>()?,
+ hash_expr: serialize_exprs(exprs, extension_codec)?,
partition_count: *partition_count as u64,
})
}
@@ -1419,10 +1337,7 @@ impl AsLogicalPlan for LogicalPlanNode {
let mut converted_order_exprs: Vec<LogicalExprNodeCollection> = vec![];
for order in order_exprs {
let temp = LogicalExprNodeCollection {
- logical_expr_nodes: order
- .iter()
- .map(|expr| serialize_expr(expr, extension_codec))
- .collect::<Result<Vec<_>, ToProtoError>>()?,
+ logical_expr_nodes: serialize_exprs(order, extension_codec)?,
};
converted_order_exprs.push(temp);
}

From 66bc222c53e98e65f8b6811cffbaf5de8bdf1985 Mon Sep 17 00:00:00 2001
From: Piotr Findeisen
Date: Tue, 27 Aug 2024 15:25:44 +0200
Subject: [PATCH 19/23] Make it easier to understand datafusion-cli exclusion
 (#12188)

---
 Cargo.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Cargo.toml b/Cargo.toml
index 479d2cadc65b..50d234c39576 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,7 @@
 # under the License.

[workspace]
+# datafusion-cli is excluded because of its Cargo.lock. See datafusion-cli/README.md.
exclude = ["datafusion-cli", "dev/depcheck"] members = [ "datafusion/common", From 6ffb1f64a7b3f48c7693d2ecb34f72a2dca24acc Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 27 Aug 2024 17:52:01 +0200 Subject: [PATCH 20/23] Remove unnecessary clones from `.../logical_plan/builder.rs` (#12196) --- datafusion/expr/src/logical_plan/builder.rs | 29 +++++++++------------ 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 6abcfe4279bc..9894fe887de0 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -211,7 +211,7 @@ impl LogicalPlanBuilder { }; common_type = Some(new_type); } else { - common_type = Some(data_type.clone()); + common_type = Some(data_type); } } field_types.push(common_type.unwrap_or(DataType::Utf8)); @@ -220,7 +220,7 @@ impl LogicalPlanBuilder { for row in &mut values { for (j, field_type) in field_types.iter().enumerate() { if let Expr::Literal(ScalarValue::Null) = row[j] { - row[j] = Expr::Literal(ScalarValue::try_from(field_type.clone())?); + row[j] = Expr::Literal(ScalarValue::try_from(field_type)?); } else { row[j] = std::mem::take(&mut row[j]).cast_to(field_type, &empty_schema)?; @@ -552,20 +552,17 @@ impl LogicalPlanBuilder { // Collect sort columns that are missing in the input plan's schema let mut missing_cols: Vec = vec![]; - exprs - .clone() - .into_iter() - .try_for_each::<_, Result<()>>(|expr| { - let columns = expr.column_refs(); + exprs.iter().try_for_each::<_, Result<()>>(|expr| { + let columns = expr.column_refs(); - columns.into_iter().for_each(|c| { - if !schema.has_column(c) { - missing_cols.push(c.clone()); - } - }); + columns.into_iter().for_each(|c| { + if !schema.has_column(c) { + missing_cols.push(c.clone()); + } + }); - Ok(()) - })?; + Ok(()) + })?; if missing_cols.is_empty() { return Ok(Self::new(LogicalPlan::Sort(Sort { @@ -710,7 +707,7 @@ impl LogicalPlanBuilder { pub(crate) fn normalize( plan: &LogicalPlan, - column: impl Into + Clone, + column: impl Into, ) -> Result { let schema = plan.schema(); let fallback_schemas = plan.fallback_normalize_schemas(); @@ -1536,7 +1533,7 @@ pub fn get_unnested_columns( | DataType::FixedSizeList(field, _) | DataType::LargeList(field) => { let new_field = Arc::new(Field::new( - col_name.clone(), + col_name, field.data_type().clone(), // Unnesting may produce NULLs even if the list is not null. // For example: unnset([1], []) -> 1, null From 8ba6732af5f4f32cbe0a23ef6bc2f393c640898b Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Wed, 28 Aug 2024 09:47:41 +0200 Subject: [PATCH 21/23] Remove unnecessary clones with clippy (#12197) * Remove unnecessary clones with clippy This is automated change done with ``` cargo clippy --fix -- -Aclippy::all -Wclippy::redundant_clone cargo fmt # manually fix few new clippy errors introduced by clippy itself ``` Note: it doesn't remove all unnecessary clones because the command reported error and backed out for the common crate. * Prevent clippy from removing intentional clone clippy can be run with `--fix` and then it won't obey the code comment instructing not to delete the clone. * Remove unnecessary clones pointed out by clippy Change code as instructed by ``` cargo clippy --fix -- -Aclippy::all -Wclippy::redundant_clone ``` where clippy didn't apply the suggested changes by itself. 
--- .../examples/advanced_parquet_index.rs | 2 +- .../examples/file_stream_provider.rs | 2 +- datafusion/common/src/dfschema.rs | 3 +- .../common/src/file_options/parquet_writer.rs | 2 +- datafusion/common/src/scalar/mod.rs | 8 ++-- datafusion/common/src/stats.rs | 1 + .../core/src/datasource/listing/helpers.rs | 2 +- .../core/src/datasource/listing/table.rs | 2 +- .../datasource/physical_plan/file_groups.rs | 7 +--- .../physical_plan/file_scan_config.rs | 4 +- .../core/src/datasource/schema_adapter.rs | 2 +- datafusion/core/src/execution/context/mod.rs | 2 +- .../enforce_distribution.rs | 23 +++++------ .../src/physical_optimizer/join_selection.rs | 21 +++++----- .../physical_optimizer/projection_pushdown.rs | 18 +++------ .../core/src/physical_optimizer/pruning.rs | 1 - datafusion/core/src/physical_planner.rs | 4 +- datafusion/core/tests/fuzz_cases/join_fuzz.rs | 10 ++--- .../physical_optimizer/limit_pushdown.rs | 16 ++++---- .../limited_distinct_aggregation.rs | 22 +++++----- .../user_defined/user_defined_aggregates.rs | 2 +- .../user_defined_scalar_functions.rs | 2 +- datafusion/execution/src/memory_pool/pool.rs | 4 +- .../expr-common/src/interval_arithmetic.rs | 10 ++--- .../expr-common/src/type_coercion/binary.rs | 2 +- datafusion/expr/src/expr_schema.rs | 4 +- datafusion/expr/src/logical_plan/builder.rs | 2 +- datafusion/expr/src/logical_plan/display.rs | 20 ++++------ .../aggregate/groups_accumulator/prim_op.rs | 2 +- .../src/approx_percentile_cont.rs | 4 +- .../functions-aggregate/src/array_agg.rs | 7 +--- .../functions-aggregate/src/nth_value.rs | 7 +--- datafusion/functions-nested/src/flatten.rs | 2 +- datafusion/functions-nested/src/utils.rs | 3 +- .../optimizer/src/common_subexpr_eliminate.rs | 12 +++--- .../optimizer/src/eliminate_nested_union.rs | 17 +++----- .../optimizer/src/eliminate_one_union.rs | 5 +-- .../optimizer/src/optimize_projections/mod.rs | 4 +- datafusion/optimizer/src/push_down_filter.rs | 2 +- .../simplify_expressions/expr_simplifier.rs | 26 ++++++------ .../src/simplify_expressions/guarantees.rs | 4 +- .../src/unwrap_cast_in_comparison.rs | 2 +- .../physical-expr/src/equivalence/ordering.rs | 2 +- .../physical-expr/src/expressions/binary.rs | 4 +- .../physical-expr/src/expressions/case.rs | 2 +- .../physical-plan/src/aggregates/mod.rs | 2 +- datafusion/physical-plan/src/joins/utils.rs | 2 +- .../physical-plan/src/recursive_query.rs | 2 +- .../src/windows/bounded_window_agg_exec.rs | 4 +- datafusion/proto/gen/src/main.rs | 2 +- .../tests/cases/roundtrip_logical_plan.rs | 2 +- .../tests/cases/roundtrip_physical_plan.rs | 40 +++++++++---------- datafusion/sql/src/expr/function.rs | 3 +- datafusion/sql/src/relation/mod.rs | 2 +- datafusion/sql/src/unparser/expr.rs | 2 +- datafusion/sql/tests/cases/plan_to_sql.rs | 2 +- 56 files changed, 162 insertions(+), 204 deletions(-) diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index 903defafe3ab..f6860bb5b87a 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -300,7 +300,7 @@ impl IndexTableProvider { // analyze the predicate. In a real system, using // `PruningPredicate::prune` would likely be easier to do. 
let pruning_predicate = - PruningPredicate::try_new(Arc::clone(predicate), self.schema().clone())?; + PruningPredicate::try_new(Arc::clone(predicate), self.schema())?; // The PruningPredicate's guarantees must all be satisfied in order for // the predicate to possibly evaluate to true. diff --git a/datafusion-examples/examples/file_stream_provider.rs b/datafusion-examples/examples/file_stream_provider.rs index b8549bd6b6e6..4db7e0200f53 100644 --- a/datafusion-examples/examples/file_stream_provider.rs +++ b/datafusion-examples/examples/file_stream_provider.rs @@ -100,7 +100,7 @@ mod non_windows { ) { // Timeout for a long period of BrokenPipe error let broken_pipe_timeout = Duration::from_secs(10); - let sa = file_path.clone(); + let sa = file_path; // Spawn a new thread to write to the FIFO file #[allow(clippy::disallowed_methods)] // spawn allowed only in tests tasks.spawn_blocking(move || { diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index f0eecd2ffeb1..095f4c510194 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -1242,10 +1242,9 @@ mod tests { #[test] fn into() { // Demonstrate how to convert back and forth between Schema, SchemaRef, DFSchema, and DFSchemaRef - let metadata = test_metadata(); let arrow_schema = Schema::new_with_metadata( vec![Field::new("c0", DataType::Int64, true)], - metadata.clone(), + test_metadata(), ); let arrow_schema_ref = Arc::new(arrow_schema.clone()); diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 4a229fe01b54..e42fb96ed6a5 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -565,7 +565,7 @@ mod tests { column_options_with_non_defaults(&parquet_options), )] .into(), - key_value_metadata: [(key.clone(), value.clone())].into(), + key_value_metadata: [(key, value)].into(), }; let writer_props = WriterPropertiesBuilder::try_from(&table_parquet_opts) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 677685b2c65b..5acc2b6f188e 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -4905,7 +4905,7 @@ mod tests { let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); - assert_eq!(non_null_list_scalar.data_type(), data_type.clone()); + assert_eq!(non_null_list_scalar.data_type(), data_type); assert_eq!(null_list_scalar.data_type(), data_type); } @@ -5582,13 +5582,13 @@ mod tests { // Define list-of-structs scalars - let nl0_array = ScalarValue::iter_to_array(vec![s0.clone(), s1.clone()]).unwrap(); + let nl0_array = ScalarValue::iter_to_array(vec![s0, s1.clone()]).unwrap(); let nl0 = ScalarValue::List(Arc::new(array_into_list_array_nullable(nl0_array))); - let nl1_array = ScalarValue::iter_to_array(vec![s2.clone()]).unwrap(); + let nl1_array = ScalarValue::iter_to_array(vec![s2]).unwrap(); let nl1 = ScalarValue::List(Arc::new(array_into_list_array_nullable(nl1_array))); - let nl2_array = ScalarValue::iter_to_array(vec![s1.clone()]).unwrap(); + let nl2_array = ScalarValue::iter_to_array(vec![s1]).unwrap(); let nl2 = ScalarValue::List(Arc::new(array_into_list_array_nullable(nl2_array))); // iter_to_array for list-of-struct diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 50ae4e3ca71f..d8e62b3045f9 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -557,6 +557,7 
@@ mod tests {
let precision: Precision<ScalarValue> =
Precision::Exact(ScalarValue::Int64(Some(42)));
// Clippy would complain about this if it were Copy
+ #[allow(clippy::redundant_clone)]
let p2 = precision.clone();
assert_eq!(precision, p2);
}
diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs
index b5dd2dd12e10..f6e938b72dab 100644
--- a/datafusion/core/src/datasource/listing/helpers.rs
+++ b/datafusion/core/src/datasource/listing/helpers.rs
@@ -282,7 +282,7 @@ async fn prune_partitions(
Default::default(),
)?;
- let batch = RecordBatch::try_new(schema.clone(), arrays)?;
+ let batch = RecordBatch::try_new(schema, arrays)?;
// TODO: Plumb this down
let props = ExecutionProps::new();
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 89066d8234ac..a0345a38e40c 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -1016,7 +1016,7 @@ impl ListingTable {
.collected_statistics
.get_with_extra(&part_file.object_meta.location, &part_file.object_meta)
{
- Some(statistics) => Ok(statistics.clone()),
+ Some(statistics) => Ok(statistics),
None => {
let statistics = self
.options
diff --git a/datafusion/core/src/datasource/physical_plan/file_groups.rs b/datafusion/core/src/datasource/physical_plan/file_groups.rs
index fb2cd4ad06ec..28f975ae193d 100644
--- a/datafusion/core/src/datasource/physical_plan/file_groups.rs
+++ b/datafusion/core/src/datasource/physical_plan/file_groups.rs
@@ -394,7 +394,7 @@ mod test {
#[test]
fn repartition_empty_file_only() {
let partitioned_file_empty = pfile("empty", 0);
- let file_group = vec![vec![partitioned_file_empty.clone()]];
+ let file_group = vec![vec![partitioned_file_empty]];
let partitioned_files = FileGroupPartitioner::new()
.with_target_partitions(4)
@@ -817,10 +817,7 @@ mod test {
.with_preserve_order_within_groups(true)
.repartition_file_groups(&file_groups);
- assert_partitioned_files(
- repartitioned.clone(),
- repartitioned_preserving_sort.clone(),
- );
+ assert_partitioned_files(repartitioned.clone(), repartitioned_preserving_sort);
repartitioned
}
}
diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
index 34fb6226c1a2..bfa5488e5b5e 100644
--- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
+++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
@@ -908,7 +908,7 @@ mod tests {
schema.clone(),
Some(vec![0, 3, 5, schema.fields().len()]),
Statistics::new_unknown(&schema),
- to_partition_cols(partition_cols.clone()),
+ to_partition_cols(partition_cols),
)
.projected_file_schema();
@@ -941,7 +941,7 @@ mod tests {
schema.clone(),
None,
Statistics::new_unknown(&schema),
- to_partition_cols(partition_cols.clone()),
+ to_partition_cols(partition_cols),
)
.projected_file_schema();
diff --git a/datafusion/core/src/datasource/schema_adapter.rs b/datafusion/core/src/datasource/schema_adapter.rs
index 40cb40a83af2..5d2d0ff91b15 100644
--- a/datafusion/core/src/datasource/schema_adapter.rs
+++ b/datafusion/core/src/datasource/schema_adapter.rs
@@ -369,7 +369,7 @@ mod tests {
let f1 = Field::new("id", DataType::Int32, true);
let f2 = Field::new("extra_column", DataType::Utf8, true);
- let schema = Arc::new(Schema::new(vec![f1.clone(), f2.clone()]));
+ let schema = Arc::new(Schema::new(vec![f1, f2]));
let extra_column = Arc::new(StringArray::from(vec!["foo"]));
let mut new_columns = batch.columns().to_vec(); diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index c2a072eef756..18943599b136 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -1165,7 +1165,7 @@ impl SessionContext { // check schema uniqueness let mut batches = batches.into_iter().peekable(); let schema = if let Some(batch) = batches.peek() { - batch.schema().clone() + batch.schema() } else { Arc::new(Schema::empty()) }; diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 77ddd098c7be..ba6f7d0439c2 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -3908,7 +3908,7 @@ pub(crate) mod tests { let alias = vec![("a".to_string(), "a".to_string())]; let plan_parquet = aggregate_exec_with_alias(parquet_exec_multiple(), alias.clone()); - let plan_csv = aggregate_exec_with_alias(csv_exec_multiple(), alias.clone()); + let plan_csv = aggregate_exec_with_alias(csv_exec_multiple(), alias); let expected_parquet = [ "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", @@ -3934,7 +3934,7 @@ pub(crate) mod tests { let alias = vec![("a".to_string(), "a".to_string())]; let plan_parquet = aggregate_exec_with_alias(parquet_exec_multiple(), alias.clone()); - let plan_csv = aggregate_exec_with_alias(csv_exec_multiple(), alias.clone()); + let plan_csv = aggregate_exec_with_alias(csv_exec_multiple(), alias); let expected_parquet = [ "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", @@ -3964,7 +3964,7 @@ pub(crate) mod tests { options: SortOptions::default(), }]; let plan_parquet = limit_exec(sort_exec(sort_key.clone(), parquet_exec(), false)); - let plan_csv = limit_exec(sort_exec(sort_key.clone(), csv_exec(), false)); + let plan_csv = limit_exec(sort_exec(sort_key, csv_exec(), false)); let expected_parquet = &[ "GlobalLimitExec: skip=0, fetch=100", @@ -4000,8 +4000,7 @@ pub(crate) mod tests { parquet_exec(), false, ))); - let plan_csv = - limit_exec(filter_exec(sort_exec(sort_key.clone(), csv_exec(), false))); + let plan_csv = limit_exec(filter_exec(sort_exec(sort_key, csv_exec(), false))); let expected_parquet = &[ "GlobalLimitExec: skip=0, fetch=100", @@ -4042,7 +4041,7 @@ pub(crate) mod tests { ); let plan_csv = aggregate_exec_with_alias( limit_exec(filter_exec(limit_exec(csv_exec()))), - alias.clone(), + alias, ); let expected_parquet = &[ @@ -4126,7 +4125,7 @@ pub(crate) mod tests { ); let plan_csv = sort_preserving_merge_exec( sort_key.clone(), - csv_exec_with_sort(vec![sort_key.clone()]), + csv_exec_with_sort(vec![sort_key]), ); // parallelization is not beneficial for SortPreservingMerge @@ -4154,7 +4153,7 @@ pub(crate) mod tests { union_exec(vec![parquet_exec_with_sort(vec![sort_key.clone()]); 2]); let input_csv = union_exec(vec![csv_exec_with_sort(vec![sort_key.clone()]); 2]); let plan_parquet = sort_preserving_merge_exec(sort_key.clone(), input_parquet); - let plan_csv = sort_preserving_merge_exec(sort_key.clone(), input_csv); + let plan_csv = sort_preserving_merge_exec(sort_key, input_csv); // should not repartition (union doesn't benefit from increased parallelism) // should not sort (as the data was already sorted) @@ -4224,8 +4223,8 @@ pub(crate) mod tests { ("c".to_string(), "c2".to_string()), ]; let proj_parquet = projection_exec_with_alias( - 
parquet_exec_with_sort(vec![sort_key.clone()]),
- alias_pairs.clone(),
+ parquet_exec_with_sort(vec![sort_key]),
+ alias_pairs,
);
let sort_key_after_projection = vec![PhysicalSortExpr {
expr: col("c2", &proj_parquet.schema()).unwrap(),
@@ -4560,7 +4559,7 @@ pub(crate) mod tests {
}];
let alias = vec![("a".to_string(), "a".to_string())];
let input = parquet_exec_with_sort(vec![sort_key]);
- let physical_plan = aggregate_exec_with_alias(input, alias.clone());
+ let physical_plan = aggregate_exec_with_alias(input, alias);
let expected = &[
"AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
@@ -4584,7 +4583,7 @@ pub(crate) mod tests {
let alias = vec![("a".to_string(), "a".to_string())];
let input = parquet_exec_multiple_sorted(vec![sort_key]);
let aggregate = aggregate_exec_with_alias(input, alias.clone());
- let physical_plan = aggregate_exec_with_alias(aggregate, alias.clone());
+ let physical_plan = aggregate_exec_with_alias(aggregate, alias);
let expected = &[
"AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
diff --git a/datafusion/core/src/physical_optimizer/join_selection.rs b/datafusion/core/src/physical_optimizer/join_selection.rs
index b849df88e4aa..2643ade8f481 100644
--- a/datafusion/core/src/physical_optimizer/join_selection.rs
+++ b/datafusion/core/src/physical_optimizer/join_selection.rs
@@ -908,7 +908,7 @@ mod tests_statistical {
);
let optimized_join = JoinSelection::new()
- .optimize(join.clone(), &ConfigOptions::new())
+ .optimize(join, &ConfigOptions::new())
.unwrap();
let swapping_projection = optimized_join
@@ -964,7 +964,7 @@ mod tests_statistical {
);
let optimized_join = JoinSelection::new()
- .optimize(join.clone(), &ConfigOptions::new())
+ .optimize(join, &ConfigOptions::new())
.unwrap();
let swapped_join = optimized_join
@@ -1140,7 +1140,7 @@ mod tests_statistical {
);
let optimized_join = JoinSelection::new()
- .optimize(join.clone(), &ConfigOptions::new())
+ .optimize(join, &ConfigOptions::new())
.unwrap();
let swapped_join = optimized_join
@@ -1180,7 +1180,7 @@ mod tests_statistical {
);
let optimized_join = JoinSelection::new()
- .optimize(join.clone(), &ConfigOptions::new())
+ .optimize(join, &ConfigOptions::new())
.unwrap();
let swapping_projection = optimized_join
@@ -1356,7 +1356,7 @@ mod tests_statistical {
Arc::new(Column::new_with_schema("small_col", &small.schema()).unwrap()) as _,
)];
check_join_partition_mode(
- big.clone(),
+ big,
small.clone(),
join_on,
true,
@@ -1380,8 +1380,8 @@ mod tests_statistical {
Arc::new(Column::new_with_schema("small_col", &small.schema()).unwrap()) as _,
)];
check_join_partition_mode(
- empty.clone(),
- small.clone(),
+ empty,
+ small,
join_on,
true,
PartitionMode::CollectLeft,
@@ -1424,7 +1424,7 @@ mod tests_statistical {
Arc::new(Column::new_with_schema("big_col", &big.schema()).unwrap()) as _,
)];
check_join_partition_mode(
- bigger.clone(),
+ bigger,
big.clone(),
join_on,
true,
@@ -1472,7 +1472,7 @@ mod tests_statistical {
);
let optimized_join = JoinSelection::new()
- .optimize(join.clone(), &ConfigOptions::new())
+ .optimize(join, &ConfigOptions::new())
.unwrap();
if !is_swapped {
@@ -1913,8 +1913,7 @@ mod hash_join_tests {
false,
)?);
- let optimized_join_plan =
- hash_join_swap_subrule(join.clone(), &ConfigOptions::new())?;
+ let optimized_join_plan = hash_join_swap_subrule(join, &ConfigOptions::new())?;
// If swap did happen
let projection_added = optimized_join_plan.as_any().is::<ProjectionExec>();
diff --git a/datafusion/core/src/physical_optimizer/projection_pushdown.rs
b/datafusion/core/src/physical_optimizer/projection_pushdown.rs
index 9c545c17da3c..b3f3f90154d0 100644
--- a/datafusion/core/src/physical_optimizer/projection_pushdown.rs
+++ b/datafusion/core/src/physical_optimizer/projection_pushdown.rs
@@ -1692,12 +1692,9 @@ mod tests {
]));
Arc::new(
CsvExec::builder(
- FileScanConfig::new(
- ObjectStoreUrl::parse("test:///").unwrap(),
- schema.clone(),
- )
- .with_file(PartitionedFile::new("x".to_string(), 100))
- .with_projection(Some(vec![0, 1, 2, 3, 4])),
+ FileScanConfig::new(ObjectStoreUrl::parse("test:///").unwrap(), schema)
+ .with_file(PartitionedFile::new("x".to_string(), 100))
+ .with_projection(Some(vec![0, 1, 2, 3, 4])),
)
.with_has_header(false)
.with_delimeter(0)
@@ -1719,12 +1716,9 @@ mod tests {
]));
Arc::new(
CsvExec::builder(
- FileScanConfig::new(
- ObjectStoreUrl::parse("test:///").unwrap(),
- schema.clone(),
- )
- .with_file(PartitionedFile::new("x".to_string(), 100))
- .with_projection(Some(vec![3, 2, 1])),
+ FileScanConfig::new(ObjectStoreUrl::parse("test:///").unwrap(), schema)
+ .with_file(PartitionedFile::new("x".to_string(), 100))
+ .with_projection(Some(vec![3, 2, 1])),
)
.with_has_header(false)
.with_delimeter(0)
diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index 0ef390fff45c..a16abc607ee6 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -1369,7 +1369,6 @@ fn build_predicate_expression(
let change_expr = in_list
.list()
.iter()
- .cloned()
.map(|e| {
Arc::new(phys_expr::BinaryExpr::new(
in_list.expr().clone(),
diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
index 9501d3c6bbbb..fe8d79846630 100644
--- a/datafusion/core/src/physical_planner.rs
+++ b/datafusion/core/src/physical_planner.rs
@@ -701,7 +701,7 @@ impl DefaultPhysicalPlanner {
let initial_aggr = Arc::new(AggregateExec::try_new(
AggregateMode::Partial,
groups.clone(),
- aggregates.clone(),
+ aggregates,
filters.clone(),
input_exec,
physical_input_schema.clone(),
@@ -2569,7 +2569,7 @@ mod tests {
impl NoOpExecutionPlan {
fn new(schema: SchemaRef) -> Self {
- let cache = Self::compute_properties(schema.clone());
+ let cache = Self::compute_properties(schema);
Self { cache }
}
diff --git a/datafusion/core/tests/fuzz_cases/join_fuzz.rs b/datafusion/core/tests/fuzz_cases/join_fuzz.rs
index f1cca66712d7..1c2d8ece2f36 100644
--- a/datafusion/core/tests/fuzz_cases/join_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/join_fuzz.rs
@@ -350,12 +350,10 @@ impl JoinFuzzTestCase {
fn left_right(&self) -> (Arc<dyn ExecutionPlan>, Arc<dyn ExecutionPlan>) {
let schema1 = self.input1[0].schema();
let schema2 = self.input2[0].schema();
- let left = Arc::new(
- MemoryExec::try_new(&[self.input1.clone()], schema1.clone(), None).unwrap(),
- );
- let right = Arc::new(
- MemoryExec::try_new(&[self.input2.clone()], schema2.clone(), None).unwrap(),
- );
+ let left =
+ Arc::new(MemoryExec::try_new(&[self.input1.clone()], schema1, None).unwrap());
+ let right =
+ Arc::new(MemoryExec::try_new(&[self.input2.clone()], schema2, None).unwrap());
(left, right)
}
diff --git a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
index b051feb5750e..750544ecdec1 100644
--- a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
+++ b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
@@ -54,7 +54,7 @@ impl PartitionStream for DummyStreamPartition {
fn
transforms_streaming_table_exec_into_fetching_version_when_skip_is_zero(
) -> datafusion_common::Result<()> {
let schema = create_schema();
- let streaming_table = streaming_table_exec(schema.clone())?;
+ let streaming_table = streaming_table_exec(schema)?;
let global_limit = global_limit_exec(streaming_table, 0, Some(5));
let initial = get_plan_string(&global_limit);
@@ -79,7 +79,7 @@ fn transforms_streaming_table_exec_into_fetching_version_when_skip_is_zero(
fn transforms_streaming_table_exec_into_fetching_version_and_keeps_the_global_limit_when_skip_is_nonzero(
) -> datafusion_common::Result<()> {
let schema = create_schema();
- let streaming_table = streaming_table_exec(schema.clone())?;
+ let streaming_table = streaming_table_exec(schema)?;
let global_limit = global_limit_exec(streaming_table, 2, Some(5));
let initial = get_plan_string(&global_limit);
@@ -107,7 +107,7 @@ fn transforms_coalesce_batches_exec_into_fetching_version_and_removes_local_limi
let schema = create_schema();
let streaming_table = streaming_table_exec(schema.clone())?;
let repartition = repartition_exec(streaming_table)?;
- let filter = filter_exec(schema.clone(), repartition)?;
+ let filter = filter_exec(schema, repartition)?;
let coalesce_batches = coalesce_batches_exec(filter);
let local_limit = local_limit_exec(coalesce_batches, 5);
let coalesce_partitions = coalesce_partitions_exec(local_limit);
@@ -146,7 +146,7 @@ fn pushes_global_limit_exec_through_projection_exec() -> datafusion_common::Resu
let schema = create_schema();
let streaming_table = streaming_table_exec(schema.clone())?;
let filter = filter_exec(schema.clone(), streaming_table)?;
- let projection = projection_exec(schema.clone(), filter)?;
+ let projection = projection_exec(schema, filter)?;
let global_limit = global_limit_exec(projection, 0, Some(5));
let initial = get_plan_string(&global_limit);
@@ -178,7 +178,7 @@ fn pushes_global_limit_exec_through_projection_exec_and_transforms_coalesce_batc
let schema = create_schema();
let streaming_table = streaming_table_exec(schema.clone()).unwrap();
let coalesce_batches = coalesce_batches_exec(streaming_table);
- let projection = projection_exec(schema.clone(), coalesce_batches)?;
+ let projection = projection_exec(schema, coalesce_batches)?;
let global_limit = global_limit_exec(projection, 0, Some(5));
let initial = get_plan_string(&global_limit);
@@ -256,7 +256,7 @@ fn keeps_pushed_local_limit_exec_when_there_are_multiple_input_partitions(
let schema = create_schema();
let streaming_table = streaming_table_exec(schema.clone())?;
let repartition = repartition_exec(streaming_table)?;
- let filter = filter_exec(schema.clone(), repartition)?;
+ let filter = filter_exec(schema, repartition)?;
let coalesce_partitions = coalesce_partitions_exec(filter);
let global_limit = global_limit_exec(coalesce_partitions, 0, Some(5));
@@ -398,9 +398,7 @@ fn streaming_table_exec(
) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
Ok(Arc::new(StreamingTableExec::try_new(
schema.clone(),
- vec![Arc::new(DummyStreamPartition {
- schema: schema.clone(),
- }) as _],
+ vec![Arc::new(DummyStreamPartition { schema }) as _],
None,
None,
true,
diff --git a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
index 48389b0304f6..042f6d622565 100644
--- a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
+++ b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
@@ -316,11 +316,11 @@ fn test_no_group_by() ->
Result<()> { // `SELECT FROM MemoryExec LIMIT 10;`, Single AggregateExec let single_agg = AggregateExec::try_new( AggregateMode::Single, - build_group_by(&schema.clone(), vec![]), - vec![], /* aggr_expr */ - vec![], /* filter_expr */ - source, /* input */ - schema.clone(), /* input_schema */ + build_group_by(&schema, vec![]), + vec![], /* aggr_expr */ + vec![], /* filter_expr */ + source, /* input */ + schema, /* input_schema */ )?; let limit_exec = LocalLimitExec::new( Arc::new(single_agg), @@ -346,7 +346,7 @@ fn test_has_aggregate_expression() -> Result<()> { // `SELECT FROM MemoryExec LIMIT 10;`, Single AggregateExec let single_agg = AggregateExec::try_new( AggregateMode::Single, - build_group_by(&schema.clone(), vec!["a".to_string()]), + build_group_by(&schema, vec!["a".to_string()]), vec![agg.count_expr(&schema)], /* aggr_expr */ vec![None], /* filter_expr */ source, /* input */ @@ -418,11 +418,11 @@ fn test_has_order_by() -> Result<()> { // the `a > 1` filter is applied in the AggregateExec let single_agg = AggregateExec::try_new( AggregateMode::Single, - build_group_by(&schema.clone(), vec!["a".to_string()]), - vec![], /* aggr_expr */ - vec![], /* filter_expr */ - source, /* input */ - schema.clone(), /* input_schema */ + build_group_by(&schema, vec!["a".to_string()]), + vec![], /* aggr_expr */ + vec![], /* filter_expr */ + source, /* input */ + schema, /* input_schema */ )?; let limit_exec = LocalLimitExec::new( Arc::new(single_agg), diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs index 93550d38021a..1e0d3d9d514e 100644 --- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs @@ -272,7 +272,7 @@ async fn deregister_udaf() -> Result<()> { Arc::new(vec![DataType::UInt64, DataType::Float64]), ); - ctx.register_udaf(my_avg.clone()); + ctx.register_udaf(my_avg); assert!(ctx.state().aggregate_functions().contains_key("my_avg")); diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index 259cce74f2e5..0f1c3b8e53c4 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -594,7 +594,7 @@ async fn deregister_udf() -> Result<()> { let cast2i64 = ScalarUDF::from(CastToI64UDF::new()); let ctx = SessionContext::new(); - ctx.register_udf(cast2i64.clone()); + ctx.register_udf(cast2i64); assert!(ctx.udfs().contains("cast_to_i64")); diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs index d3cd93979baf..e169c1f319cc 100644 --- a/datafusion/execution/src/memory_pool/pool.rs +++ b/datafusion/execution/src/memory_pool/pool.rs @@ -366,7 +366,7 @@ impl MemoryPool for TrackConsumersPool { // wrap OOM message in top consumers DataFusionError::ResourcesExhausted( provide_top_memory_consumers_to_error_msg( - e.to_owned(), + e, self.report_top(self.top.into()), ), ) @@ -540,7 +540,7 @@ mod tests { // Test: will be the same per Top Consumers reported. r0.grow(10); // make r0=10, pool available=90 let new_consumer_same_name = MemoryConsumer::new(same_name); - let mut r1 = new_consumer_same_name.clone().register(&pool); + let mut r1 = new_consumer_same_name.register(&pool); // TODO: the insufficient_capacity_err() message is per reservation, not per consumer. 
// a followup PR will clarify this message "0 bytes already allocated for this reservation" let expected = "Additional allocation failed with top memory consumers (across reservations) as: foo consumed 10 bytes. Error: Failed to allocate additional 150 bytes for foo with 0 bytes already allocated for this reservation - 90 bytes remain available for the total pool"; diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index e3ff412e785b..6424888c896a 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -1877,11 +1877,7 @@ mod tests { .sub(value.clone()) .unwrap() .lt(&eps)); - assert!(value - .clone() - .sub(prev_value(value.clone())) - .unwrap() - .lt(&eps)); + assert!(value.sub(prev_value(value.clone())).unwrap().lt(&eps)); assert_ne!(next_value(value.clone()), value); assert_ne!(prev_value(value.clone()), value); }); @@ -1913,11 +1909,11 @@ mod tests { min_max.into_iter().zip(inf).for_each(|((min, max), inf)| { assert_eq!(next_value(max.clone()), inf); assert_ne!(prev_value(max.clone()), max); - assert_ne!(prev_value(max.clone()), inf); + assert_ne!(prev_value(max), inf); assert_eq!(prev_value(min.clone()), inf); assert_ne!(next_value(min.clone()), min); - assert_ne!(next_value(min.clone()), inf); + assert_ne!(next_value(min), inf); assert_eq!(next_value(inf.clone()), inf); assert_eq!(prev_value(inf.clone()), inf); diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 552bc89b38f4..3617f56905a9 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -1881,7 +1881,7 @@ mod tests { ); test_coercion_binary_rule!( DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())), - DataType::Timestamp(TimeUnit::Second, utc.clone()), + DataType::Timestamp(TimeUnit::Second, utc), Operator::Eq, DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())) ); diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 10ec10e61239..3920a1a3517c 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -184,7 +184,7 @@ impl ExprSchemable for Expr { err, utils::generate_signature_error_msg( fun.name(), - fun.signature().clone(), + fun.signature(), &data_types ) ) @@ -199,7 +199,7 @@ impl ExprSchemable for Expr { err, utils::generate_signature_error_msg( fun.name(), - fun.signature().clone(), + fun.signature(), &data_types ) ) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 9894fe887de0..2c2300b123c2 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -2072,7 +2072,7 @@ mod tests { let schema = Schema::new(vec![ Field::new("scalar", DataType::UInt32, false), Field::new_list("strings", string_field, false), - Field::new_list("structs", struct_field_in_list.clone(), false), + Field::new_list("structs", struct_field_in_list, false), Field::new( "struct_singular", DataType::Struct(Fields::from(vec![ diff --git a/datafusion/expr/src/logical_plan/display.rs b/datafusion/expr/src/logical_plan/display.rs index 343eda056ffe..5a881deb54e1 100644 --- a/datafusion/expr/src/logical_plan/display.rs +++ b/datafusion/expr/src/logical_plan/display.rs @@ -387,19 +387,16 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> { } if !full_filter.is_empty() { - object["Full 
Filters"] = serde_json::Value::String( - expr_vec_fmt!(full_filter).to_string(), - ); + object["Full Filters"] = + serde_json::Value::String(expr_vec_fmt!(full_filter)); }; if !partial_filter.is_empty() { - object["Partial Filters"] = serde_json::Value::String( - expr_vec_fmt!(partial_filter).to_string(), - ); + object["Partial Filters"] = + serde_json::Value::String(expr_vec_fmt!(partial_filter)); } if !unsupported_filters.is_empty() { - object["Unsupported Filters"] = serde_json::Value::String( - expr_vec_fmt!(unsupported_filters).to_string(), - ); + object["Unsupported Filters"] = + serde_json::Value::String(expr_vec_fmt!(unsupported_filters)); } } @@ -595,9 +592,8 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> { "Select": expr_vec_fmt!(select_expr), }); if let Some(sort_expr) = sort_expr { - object["Sort"] = serde_json::Value::String( - expr_vec_fmt!(sort_expr).to_string(), - ); + object["Sort"] = + serde_json::Value::String(expr_vec_fmt!(sort_expr)); } object diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/prim_op.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/prim_op.rs index b5c6171af37c..8bbcf756c37c 100644 --- a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/prim_op.rs +++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/prim_op.rs @@ -167,7 +167,7 @@ where // Rebuilding input values with a new nulls mask, which is equal to // the union of original nulls and filter mask - let (dt, values_buf, original_nulls) = values.clone().into_parts(); + let (dt, values_buf, original_nulls) = values.into_parts(); let nulls_buf = NullBuffer::union(original_nulls.as_ref(), Some(&filter_nulls)); PrimitiveArray::::new(values_buf, nulls_buf).with_data_type(dt) diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 867dfed485c7..5578aebbf403 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -126,9 +126,9 @@ impl ApproxPercentileCont { | DataType::Float32 | DataType::Float64) => { if let Some(max_size) = tdigest_max_size { - ApproxPercentileAccumulator::new_with_max_size(percentile, t.clone(), max_size) + ApproxPercentileAccumulator::new_with_max_size(percentile, t, max_size) }else{ - ApproxPercentileAccumulator::new(percentile, t.clone()) + ApproxPercentileAccumulator::new(percentile, t) } } diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index b641d388a7c5..15146fc4a2d8 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -501,11 +501,8 @@ impl OrderSensitiveArrayAggAccumulator { column_wise_ordering_values.push(array); } - let ordering_array = StructArray::try_new( - struct_field.clone(), - column_wise_ordering_values, - None, - )?; + let ordering_array = + StructArray::try_new(struct_field, column_wise_ordering_values, None)?; Ok(ScalarValue::List(Arc::new(array_into_list_array_nullable( Arc::new(ordering_array), )))) diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs index cb1ddd4738c4..7425bdfa18e7 100644 --- a/datafusion/functions-aggregate/src/nth_value.rs +++ b/datafusion/functions-aggregate/src/nth_value.rs @@ -374,11 +374,8 @@ impl NthValueAccumulator { column_wise_ordering_values.push(array); } - let ordering_array = 
StructArray::try_new(
- struct_field.clone(),
- column_wise_ordering_values,
- None,
- )?;
+ let ordering_array =
+ StructArray::try_new(struct_field, column_wise_ordering_values, None)?;
Ok(ScalarValue::List(Arc::new(array_into_list_array_nullable(
Arc::new(ordering_array),
diff --git a/datafusion/functions-nested/src/flatten.rs b/datafusion/functions-nested/src/flatten.rs
index 2b383af3d456..b04c35667226 100644
--- a/datafusion/functions-nested/src/flatten.rs
+++ b/datafusion/functions-nested/src/flatten.rs
@@ -147,7 +147,7 @@ fn flatten_internal<O: OffsetSizeTrait>(
let list_arr = GenericListArray::<O>::new(field, offsets, values, None);
Ok(list_arr)
} else {
- Ok(list_arr.clone())
+ Ok(list_arr)
}
}
}
diff --git a/datafusion/functions-nested/src/utils.rs b/datafusion/functions-nested/src/utils.rs
index 688e1633e5cf..3d5b261618d5 100644
--- a/datafusion/functions-nested/src/utils.rs
+++ b/datafusion/functions-nested/src/utils.rs
@@ -296,8 +296,7 @@ mod tests {
let array3d_1 = Arc::new(array_into_list_array_nullable(array2d_1)) as ArrayRef;
let array3d_2 = array_into_list_array_nullable(array2d_2.to_owned());
let res =
- align_array_dimensions::<i32>(vec![array1d_1, Arc::new(array3d_2.clone())])
- .unwrap();
+ align_array_dimensions::<i32>(vec![array1d_1, Arc::new(array3d_2)]).unwrap();
let expected = as_list_array(&array3d_1).unwrap();
let expected_dim = datafusion_common::utils::list_ndims(array3d_1.data_type());
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 3fcee4123b76..3a2b190359d4 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -1431,7 +1431,7 @@ mod test {
fn nested_aliases() -> Result<()> {
let table_scan = test_table_scan()?;
- let plan = LogicalPlanBuilder::from(table_scan.clone())
+ let plan = LogicalPlanBuilder::from(table_scan)
.project(vec![
(col("a") + col("b") - col("c")).alias("alias1")
* (col("a") + col("b")),
col("a") + col("b"),
@@ -1842,7 +1842,7 @@ mod test {
let config = &OptimizerContext::new();
let _common_expr_1 = config.alias_generator().next(CSE_PREFIX);
let common_expr_2 = config.alias_generator().next(CSE_PREFIX);
- let plan = LogicalPlanBuilder::from(table_scan.clone())
+ let plan = LogicalPlanBuilder::from(table_scan)
.project(vec![
(col("a") + col("b")).alias(common_expr_2.clone()),
col("c"),
@@ -1886,7 +1886,7 @@ mod test {
let extracted_short_circuit_leg_1 = (col("a") + col("b")).eq(lit(0));
let not_extracted_short_circuit_leg_2 = (col("a") - col("b")).eq(lit(0));
let extracted_short_circuit_leg_3 = (col("a") * col("b")).eq(lit(0));
- let plan = LogicalPlanBuilder::from(table_scan.clone())
+ let plan = LogicalPlanBuilder::from(table_scan)
.project(vec![
extracted_short_circuit.clone().alias("c1"),
extracted_short_circuit.alias("c2"),
@@ -1899,7 +1899,7 @@ mod test {
.alias("c4"),
extracted_short_circuit_leg_3
.clone()
- .or(extracted_short_circuit_leg_3.clone())
+ .or(extracted_short_circuit_leg_3)
.alias("c5"),
])?
.build()?; @@ -1920,7 +1920,7 @@ mod test { let extracted_child = col("a") + col("b"); let rand = rand_func().call(vec![]); let not_extracted_volatile = extracted_child + rand; - let plan = LogicalPlanBuilder::from(table_scan.clone()) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![ not_extracted_volatile.clone().alias("c1"), not_extracted_volatile.alias("c2"), @@ -1947,7 +1947,7 @@ mod test { let not_extracted_short_circuit_leg_2 = col("b").eq(lit(0)); let not_extracted_volatile_short_circuit_2 = rand.eq(lit(0)).or(not_extracted_short_circuit_leg_2); - let plan = LogicalPlanBuilder::from(table_scan.clone()) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![ not_extracted_volatile_short_circuit_1.clone().alias("c1"), not_extracted_volatile_short_circuit_1.alias("c2"), diff --git a/datafusion/optimizer/src/eliminate_nested_union.rs b/datafusion/optimizer/src/eliminate_nested_union.rs index e9b38567a982..965771326854 100644 --- a/datafusion/optimizer/src/eliminate_nested_union.rs +++ b/datafusion/optimizer/src/eliminate_nested_union.rs @@ -144,10 +144,7 @@ mod tests { fn eliminate_nothing() -> Result<()> { let plan_builder = table_scan(Some("table"), &schema(), None)?; - let plan = plan_builder - .clone() - .union(plan_builder.clone().build()?)? - .build()?; + let plan = plan_builder.clone().union(plan_builder.build()?)?.build()?; let expected = "\ Union\ @@ -162,7 +159,7 @@ mod tests { let plan = plan_builder .clone() - .union_distinct(plan_builder.clone().build()?)? + .union_distinct(plan_builder.build()?)? .build()?; let expected = "Distinct:\ @@ -180,7 +177,7 @@ mod tests { .clone() .union(plan_builder.clone().build()?)? .union(plan_builder.clone().build()?)? - .union(plan_builder.clone().build()?)? + .union(plan_builder.build()?)? .build()?; let expected = "\ @@ -200,7 +197,7 @@ mod tests { .clone() .union_distinct(plan_builder.clone().build()?)? .union(plan_builder.clone().build()?)? - .union(plan_builder.clone().build()?)? + .union(plan_builder.build()?)? .build()?; let expected = "Union\ @@ -222,7 +219,7 @@ mod tests { .union(plan_builder.clone().build()?)? .union_distinct(plan_builder.clone().build()?)? .union(plan_builder.clone().build()?)? - .union_distinct(plan_builder.clone().build()?)? + .union_distinct(plan_builder.build()?)? .build()?; let expected = "Distinct:\ @@ -243,7 +240,7 @@ mod tests { .clone() .union_distinct(plan_builder.clone().distinct()?.build()?)? .union(plan_builder.clone().distinct()?.build()?)? - .union_distinct(plan_builder.clone().build()?)? + .union_distinct(plan_builder.build()?)? .build()?; let expected = "Distinct:\ @@ -271,7 +268,6 @@ mod tests { )? .union( plan_builder - .clone() .project(vec![col("id").alias("_id"), col("key"), col("value")])? .build()?, )? @@ -300,7 +296,6 @@ mod tests { )? .union_distinct( plan_builder - .clone() .project(vec![col("id").alias("_id"), col("key"), col("value")])? .build()?, )? diff --git a/datafusion/optimizer/src/eliminate_one_union.rs b/datafusion/optimizer/src/eliminate_one_union.rs index 383b33637f6f..7a1c4e118e05 100644 --- a/datafusion/optimizer/src/eliminate_one_union.rs +++ b/datafusion/optimizer/src/eliminate_one_union.rs @@ -94,10 +94,7 @@ mod tests { fn eliminate_nothing() -> Result<()> { let plan_builder = table_scan(Some("table"), &schema(), None)?; - let plan = plan_builder - .clone() - .union(plan_builder.clone().build()?)? 
- .build()?;
+ let plan = plan_builder.clone().union(plan_builder.build()?)?.build()?;
let expected = "\
Union\
diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs
index 5db5afd11062..35b0d07751ff 100644
--- a/datafusion/optimizer/src/optimize_projections/mod.rs
+++ b/datafusion/optimizer/src/optimize_projections/mod.rs
@@ -1336,8 +1336,8 @@ mod tests {
let right_table = test_table_scan_with_name("r")?;
let custom_plan = LogicalPlan::Extension(Extension {
node: Arc::new(UserDefinedCrossJoin::new(
- Arc::new(left_table.clone()),
- Arc::new(right_table.clone()),
+ Arc::new(left_table),
+ Arc::new(right_table),
)),
});
let plan = LogicalPlanBuilder::from(custom_plan)
diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs
index 6e75f46c2d0b..82149a087e63 100644
--- a/datafusion/optimizer/src/push_down_filter.rs
+++ b/datafusion/optimizer/src/push_down_filter.rs
@@ -819,7 +819,7 @@ impl OptimizerRule for PushDownFilter {
.map(|e| Ok(Column::from_qualified_name(e.schema_name().to_string())))
.collect::<Result<Vec<_>>>()?;
- let predicates = split_conjunction_owned(filter.predicate.clone());
+ let predicates = split_conjunction_owned(filter.predicate);
let mut keep_predicates = vec![];
let mut push_predicates = vec![];
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index c45df74a564d..7129ceb0fea1 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -3407,32 +3407,32 @@ mod tests {
let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).and(
in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], false),
);
- assert_eq!(simplify(expr.clone()), lit(false));
+ assert_eq!(simplify(expr), lit(false));
// 2. c1 IN (1,2,3,4) AND c1 IN (4,5,6,7) -> c1 = 4
let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).and(
in_list(col("c1"), vec![lit(4), lit(5), lit(6), lit(7)], false),
);
- assert_eq!(simplify(expr.clone()), col("c1").eq(lit(4)));
+ assert_eq!(simplify(expr), col("c1").eq(lit(4)));
// 3. c1 NOT IN (1, 2, 3, 4) OR c1 NOT IN (5, 6, 7, 8) -> true
let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).or(
in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], true),
);
- assert_eq!(simplify(expr.clone()), lit(true));
+ assert_eq!(simplify(expr), lit(true));
// 3.5 c1 NOT IN (1, 2, 3, 4) OR c1 NOT IN (4, 5, 6, 7) -> c1 != 4 (4 overlaps)
let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).or(
in_list(col("c1"), vec![lit(4), lit(5), lit(6), lit(7)], true),
);
- assert_eq!(simplify(expr.clone()), col("c1").not_eq(lit(4)));
+ assert_eq!(simplify(expr), col("c1").not_eq(lit(4)));
// 4.
c1 NOT IN (1,2,3,4) AND c1 NOT IN (4,5,6,7) -> c1 NOT IN (1,2,3,4,5,6,7) let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).and( in_list(col("c1"), vec![lit(4), lit(5), lit(6), lit(7)], true), ); assert_eq!( - simplify(expr.clone()), + simplify(expr), in_list( col("c1"), vec![lit(1), lit(2), lit(3), lit(4), lit(5), lit(6), lit(7)], @@ -3445,7 +3445,7 @@ mod tests { in_list(col("c1"), vec![lit(2), lit(3), lit(4), lit(5)], false), ); assert_eq!( - simplify(expr.clone()), + simplify(expr), in_list( col("c1"), vec![lit(1), lit(2), lit(3), lit(4), lit(5)], @@ -3459,7 +3459,7 @@ mod tests { vec![lit(1), lit(2), lit(3), lit(4), lit(5)], true, )); - assert_eq!(simplify(expr.clone()), lit(false)); + assert_eq!(simplify(expr), lit(false)); // 7. c1 NOT IN (1,2,3,4) AND c1 IN (1,2,3,4,5) -> c1 = 5 let expr = @@ -3468,14 +3468,14 @@ mod tests { vec![lit(1), lit(2), lit(3), lit(4), lit(5)], false, )); - assert_eq!(simplify(expr.clone()), col("c1").eq(lit(5))); + assert_eq!(simplify(expr), col("c1").eq(lit(5))); // 8. c1 IN (1,2,3,4) AND c1 NOT IN (5,6,7,8) -> c1 IN (1,2,3,4) let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).and( in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], true), ); assert_eq!( - simplify(expr.clone()), + simplify(expr), in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false) ); @@ -3493,7 +3493,7 @@ mod tests { )) .and(in_list(col("c1"), vec![lit(3), lit(6)], false)); assert_eq!( - simplify(expr.clone()), + simplify(expr), col("c1").eq(lit(3)).or(col("c1").eq(lit(6))) ); @@ -3507,7 +3507,7 @@ mod tests { )) .and(in_list(col("c1"), vec![lit(8), lit(9), lit(10)], false)), ); - assert_eq!(simplify(expr.clone()), col("c1").eq(lit(8))); + assert_eq!(simplify(expr), col("c1").eq(lit(8))); // Contains non-InList expression // c1 NOT IN (1,2,3,4) OR c1 != 5 OR c1 NOT IN (6,7,8,9) -> c1 NOT IN (1,2,3,4) OR c1 != 5 OR c1 NOT IN (6,7,8,9) @@ -3622,7 +3622,7 @@ mod tests { let expr_x = col("c3").gt(lit(3_i64)); let expr_y = (col("c4") + lit(2_u32)).lt(lit(10_u32)); let expr_z = col("c1").in_list(vec![lit("a"), lit("b")], true); - let expr = expr_x.clone().and(expr_y.clone().or(expr_z)); + let expr = expr_x.clone().and(expr_y.or(expr_z)); // All guaranteed null let guarantees = vec![ @@ -3698,7 +3698,7 @@ mod tests { col("c4"), NullableInterval::from(ScalarValue::UInt32(Some(3))), )]; - let output = simplify_with_guarantee(expr.clone(), guarantees); + let output = simplify_with_guarantee(expr, guarantees); assert_eq!(&output, &expr_x); } diff --git a/datafusion/optimizer/src/simplify_expressions/guarantees.rs b/datafusion/optimizer/src/simplify_expressions/guarantees.rs index 09fdd7685a9c..afcbe528083b 100644 --- a/datafusion/optimizer/src/simplify_expressions/guarantees.rs +++ b/datafusion/optimizer/src/simplify_expressions/guarantees.rs @@ -225,12 +225,12 @@ mod tests { // x IS NULL => guaranteed false let expr = col("x").is_null(); - let output = expr.clone().rewrite(&mut rewriter).data().unwrap(); + let output = expr.rewrite(&mut rewriter).data().unwrap(); assert_eq!(output, lit(false)); // x IS NOT NULL => guaranteed true let expr = col("x").is_not_null(); - let output = expr.clone().rewrite(&mut rewriter).data().unwrap(); + let output = expr.rewrite(&mut rewriter).data().unwrap(); assert_eq!(output, lit(true)); } diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index e0f50a470d43..b17d69437cbe 100644 --- 
a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
+++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
@@ -627,7 +627,7 @@ mod tests {
Box::new(DataType::Int32),
Box::new(ScalarValue::LargeUtf8(Some("value".to_owned()))),
);
- let expr_input = cast(col("largestr"), dict.data_type()).eq(lit(dict.clone()));
+ let expr_input = cast(col("largestr"), dict.data_type()).eq(lit(dict));
let expected =
col("largestr").eq(lit(ScalarValue::LargeUtf8(Some("value".to_owned()))));
assert_eq!(optimize_test(expr_input, &schema), expected);
diff --git a/datafusion/physical-expr/src/equivalence/ordering.rs b/datafusion/physical-expr/src/equivalence/ordering.rs
index c4b8a5c46563..49a0de7252ab 100644
--- a/datafusion/physical-expr/src/equivalence/ordering.rs
+++ b/datafusion/physical-expr/src/equivalence/ordering.rs
@@ -272,7 +272,7 @@ mod tests {
// Crude ordering doesn't satisfy finer ordering. should return false
let mut eq_properties_crude =
EquivalenceProperties::new(Arc::clone(&input_schema));
- eq_properties_crude.oeq_class.push(crude.clone());
+ eq_properties_crude.oeq_class.push(crude);
assert!(!eq_properties_crude.ordering_satisfy(&finer));
Ok(())
}
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index b663d8614275..2680a7930ff1 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -2582,7 +2582,7 @@ mod tests {
&a,
&b,
Operator::RegexIMatch,
- regex_expected.clone(),
+ regex_expected,
)?;
apply_logic_op(
&Arc::new(schema.clone()),
@@ -2596,7 +2596,7 @@ mod tests {
&a,
&b,
Operator::RegexNotIMatch,
- regex_not_expected.clone(),
+ regex_not_expected,
)?;
Ok(())
diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs
index c6afb5c05985..712175c9afbe 100644
--- a/datafusion/physical-expr/src/expressions/case.rs
+++ b/datafusion/physical-expr/src/expressions/case.rs
@@ -380,7 +380,7 @@ impl CaseExpr {
// keep `else_expr`'s data type and return type consistent
let e = self.else_expr.as_ref().unwrap();
- let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())
+ let expr = try_cast(Arc::clone(e), &batch.schema(), return_type)
.unwrap_or_else(|_| Arc::clone(e));
let else_ = Scalar::new(expr.evaluate(batch)?.into_array(1)?);
diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs
index 0aeaa15b5f1d..6ca01928c80f 100644
--- a/datafusion/physical-plan/src/aggregates/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/mod.rs
@@ -2220,7 +2220,7 @@ mod tests {
let aggregate_exec = Arc::new(AggregateExec::try_new(
AggregateMode::Partial,
groups,
- aggregates.clone(),
+ aggregates,
vec![None, None],
Arc::clone(&blocking_exec) as Arc<dyn ExecutionPlan>,
schema,
diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs
index 8fdbf7041e2f..89f3feaf07be 100644
--- a/datafusion/physical-plan/src/joins/utils.rs
+++ b/datafusion/physical-plan/src/joins/utils.rs
@@ -2443,7 +2443,7 @@ mod tests {
Statistics {
num_rows: Absent,
total_byte_size: Absent,
- column_statistics: dummy_column_stats.clone(),
+ column_statistics: dummy_column_stats,
},
&join_on,
);
diff --git a/datafusion/physical-plan/src/recursive_query.rs b/datafusion/physical-plan/src/recursive_query.rs
index bd9303f97db0..e9ea9d4f5032 100644
--- a/datafusion/physical-plan/src/recursive_query.rs
+++ b/datafusion/physical-plan/src/recursive_query.rs
@@ -394,7
+394,7 @@ impl Stream for RecursiveQueryStream { self.recursive_stream = None; self.poll_next_iteration(cx) } - Some(Ok(batch)) => self.push_batch(batch.clone()), + Some(Ok(batch)) => self.push_batch(batch), _ => Poll::Ready(batch_result), } } else { diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 084436ee376d..c1bcd83a6fd2 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -1312,7 +1312,7 @@ mod tests { &args, &partitionby_exprs, &orderby_exprs, - Arc::new(window_frame.clone()), + Arc::new(window_frame), &input.schema(), false, )?], @@ -1485,7 +1485,7 @@ mod tests { let partitions = vec![ Arc::new(TestStreamPartition { schema: Arc::clone(&schema), - batches: batches.clone(), + batches, idx: 0, state: PolingState::BatchReturn, sleep_duration: per_batch_wait_duration, diff --git a/datafusion/proto/gen/src/main.rs b/datafusion/proto/gen/src/main.rs index d3b3c92f6065..be61ff58fa8d 100644 --- a/datafusion/proto/gen/src/main.rs +++ b/datafusion/proto/gen/src/main.rs @@ -55,7 +55,7 @@ fn main() -> Result<(), String> { let common_path = proto_dir.join("src/datafusion_common.rs"); println!( "Copying {} to {}", - prost.clone().display(), + prost.display(), proto_dir.join("src/generated/prost.rs").display() ); std::fs::copy(prost, proto_dir.join("src/generated/prost.rs")).unwrap(); diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 4f58185798f7..94ac913e1968 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -2434,7 +2434,7 @@ fn roundtrip_window() { WindowFunctionDefinition::AggregateUDF(avg_udaf()), vec![col("col1")], )) - .window_frame(row_number_frame.clone()) + .window_frame(row_number_frame) .build() .unwrap(); diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index 60f5565bdeee..3e49dc24fd5a 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -404,7 +404,7 @@ fn rountrip_aggregate_with_limit() -> Result<()> { let agg = AggregateExec::try_new( AggregateMode::Final, PhysicalGroupBy::new_single(groups.clone()), - aggregates.clone(), + aggregates, vec![None], Arc::new(EmptyExec::new(schema.clone())), schema, @@ -433,7 +433,7 @@ fn rountrip_aggregate_with_approx_pencentile_cont() -> Result<()> { let agg = AggregateExec::try_new( AggregateMode::Final, PhysicalGroupBy::new_single(groups.clone()), - aggregates.clone(), + aggregates, vec![None], Arc::new(EmptyExec::new(schema.clone())), schema, @@ -469,7 +469,7 @@ fn rountrip_aggregate_with_sort() -> Result<()> { let agg = AggregateExec::try_new( AggregateMode::Final, PhysicalGroupBy::new_single(groups.clone()), - aggregates.clone(), + aggregates, vec![None], Arc::new(EmptyExec::new(schema.clone())), schema, @@ -536,7 +536,7 @@ fn roundtrip_aggregate_udaf() -> Result<()> { Arc::new(AggregateExec::try_new( AggregateMode::Final, PhysicalGroupBy::new_single(groups.clone()), - aggregates.clone(), + aggregates, vec![None], Arc::new(EmptyExec::new(schema.clone())), schema, @@ -640,7 +640,7 @@ fn roundtrip_coalesce_with_fetch() -> Result<()> { )))?; roundtrip_test(Arc::new( - CoalesceBatchesExec::new(Arc::new(EmptyExec::new(schema.clone())), 8096) 
+ CoalesceBatchesExec::new(Arc::new(EmptyExec::new(schema)), 8096) .with_fetch(Some(10)), )) } @@ -991,18 +991,16 @@ fn roundtrip_scalar_udf_extension_codec() -> Result<()> { )), input, )?); - let aggr_expr = AggregateExprBuilder::new( - max_udaf(), - vec![udf_expr.clone() as Arc], - ) - .schema(schema.clone()) - .alias("max") - .build()?; + let aggr_expr = + AggregateExprBuilder::new(max_udaf(), vec![udf_expr as Arc]) + .schema(schema.clone()) + .alias("max") + .build()?; let window = Arc::new(WindowAggExec::try_new( vec![Arc::new(PlainAggregateWindowExpr::new( aggr_expr.clone(), - &[col("author", &schema.clone())?], + &[col("author", &schema)?], &[], Arc::new(WindowFrame::new(None)), ))], @@ -1013,10 +1011,10 @@ fn roundtrip_scalar_udf_extension_codec() -> Result<()> { let aggregate = Arc::new(AggregateExec::try_new( AggregateMode::Final, PhysicalGroupBy::new(vec![], vec![], vec![]), - vec![aggr_expr.clone()], + vec![aggr_expr], vec![None], window, - schema.clone(), + schema, )?); let ctx = SessionContext::new(); @@ -1054,7 +1052,7 @@ fn roundtrip_aggregate_udf_extension_codec() -> Result<()> { Arc::new(BinaryExpr::new( col("published", &schema)?, Operator::And, - Arc::new(BinaryExpr::new(udf_expr.clone(), Operator::Gt, lit(0))), + Arc::new(BinaryExpr::new(udf_expr, Operator::Gt, lit(0))), )), input, )?); @@ -1083,7 +1081,7 @@ fn roundtrip_aggregate_udf_extension_codec() -> Result<()> { vec![aggr_expr], vec![None], window, - schema.clone(), + schema, )?); let ctx = SessionContext::new(); @@ -1158,7 +1156,7 @@ fn roundtrip_json_sink() -> Result<()> { roundtrip_test(Arc::new(DataSinkExec::new( input, data_sink, - schema.clone(), + schema, Some(sort_order), ))) } @@ -1197,7 +1195,7 @@ fn roundtrip_csv_sink() -> Result<()> { Arc::new(DataSinkExec::new( input, data_sink, - schema.clone(), + schema, Some(sort_order), )), &ctx, @@ -1253,7 +1251,7 @@ fn roundtrip_parquet_sink() -> Result<()> { roundtrip_test(Arc::new(DataSinkExec::new( input, data_sink, - schema.clone(), + schema, Some(sort_order), ))) } @@ -1342,7 +1340,7 @@ fn roundtrip_interleave() -> Result<()> { )?; let right = RepartitionExec::try_new( Arc::new(EmptyExec::new(Arc::new(schema_right))), - partition.clone(), + partition, )?; let inputs: Vec> = vec![Arc::new(left), Arc::new(right)]; let interleave = InterleaveExec::try_new(inputs)?; diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 72e08e4b8fb5..71e40c20b80a 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -245,8 +245,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Build Unnest expression if name.eq("unnest") { - let mut exprs = - self.function_args_to_expr(args.clone(), schema, planner_context)?; + let mut exprs = self.function_args_to_expr(args, schema, planner_context)?; if exprs.len() != 1 { return plan_err!("unnest() requires exactly one argument"); } diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 86e49780724b..f8ebb04f3810 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -214,7 +214,7 @@ fn optimize_subquery_sort(plan: LogicalPlan) -> Result> // 2. RANK / ROW_NUMBER ... => Handled by a `WindowAggr` and its requirements. // 3. LIMIT => Handled by a `Sort`, so we need to search for it. 
let mut has_limit = false; - let new_plan = plan.clone().transform_down(|c| { + let new_plan = plan.transform_down(|c| { if let LogicalPlan::Limit(_) = c { has_limit = true; return Ok(Transformed::no(c)); } diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 9ce627aecc76..0dbcba162bc0 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -1864,7 +1864,7 @@ mod tests { r#"EXISTS (SELECT * FROM t WHERE (t.a = 1))"#, ), ( - not_exists(Arc::new(dummy_logical_plan.clone())), + not_exists(Arc::new(dummy_logical_plan)), r#"NOT EXISTS (SELECT * FROM t WHERE (t.a = 1))"#, ), ( diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index ed23fada0cfb..cdc7bef06afd 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -203,7 +203,7 @@ fn roundtrip_crossjoin() -> Result<()> { println!("plan {}", plan.display_indent()); let plan_roundtrip = sql_to_rel - .sql_statement_to_plan(roundtrip_statement.clone()) + .sql_statement_to_plan(roundtrip_statement) .unwrap(); let expected = "Projection: j1.j1_id, j2.j2_string\ From 7d8bb0b346643381fbe79abc480cc57a5522a5d6 Mon Sep 17 00:00:00 2001 From: WeblWabl Date: Wed, 28 Aug 2024 07:26:41 -0500 Subject: [PATCH 22/23] Add documentation on `EXPLAIN` and `EXPLAIN ANALYZE` (#12122) * feat(wip): working on adding explain docs Signed-off-by: Devan * working on it Signed-off-by: Devan * working on it -- adding plan descriptions Signed-off-by: Devan * adds descriptions for phys plan and note on parallel Signed-off-by: Devan * adds information about the logical plan Signed-off-by: Devan * Add page to index * Update example, add information on how to get the datafile * Add example of reading explain analyze * Add section on partitioning * prettier * adding aggregate plan explain docs Signed-off-by: Devan * field -> column Signed-off-by: Devan * repartition update Signed-off-by: Devan * prettier Signed-off-by: Devan * clarify some points --------- Signed-off-by: Devan Co-authored-by: Andrew Lamb --- docs/source/index.rst | 1 + docs/source/user-guide/explain-usage.md | 365 ++++++++++++++++++++++++ docs/source/user-guide/sql/explain.md | 2 + 3 files changed, 368 insertions(+) create mode 100644 docs/source/user-guide/explain-usage.md diff --git a/docs/source/index.rst b/docs/source/index.rst index 9c8c886d2502..bb5ea430a321 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -89,6 +89,7 @@ To get started, see user-guide/expressions user-guide/sql/index user-guide/configs + user-guide/explain-usage user-guide/faq .. _toc.library-user-guide: diff --git a/docs/source/user-guide/explain-usage.md b/docs/source/user-guide/explain-usage.md new file mode 100644 index 000000000000..a65fad92d104 --- /dev/null +++ b/docs/source/user-guide/explain-usage.md @@ -0,0 +1,365 @@ + + +# Reading Explain Plans + +## Introduction + +This section describes how to read a DataFusion query plan. While fully +comprehending all details of these plans requires significant expertise in the +DataFusion engine, this guide will help you get started with the basics. + +DataFusion executes queries using a `query plan`.
To see the plan without +running the query, add the keyword `EXPLAIN` to your SQL query or call the +[DataFrame::explain] method. + +[dataframe::explain]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.explain + +## Example: Select and filter + +In this section, we run example queries against the `hits.parquet` file. See +[below](#data-in-this-example) for information on how to get this file. + +Let's see how DataFusion runs a query that selects the top 5 watch lists for the +site `http://domcheloveplanet.ru/`: + +```sql +EXPLAIN SELECT "WatchID" AS wid, "hits.parquet"."ClientIP" AS ip +FROM 'hits.parquet' +WHERE starts_with("URL", 'http://domcheloveplanet.ru/') +ORDER BY wid ASC, ip DESC +LIMIT 5; +``` + +The output will look like this: + +``` ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: wid ASC NULLS LAST, ip DESC NULLS FIRST, fetch=5 | +| | Projection: hits.parquet.WatchID AS wid, hits.parquet.ClientIP AS ip | +| | Filter: starts_with(hits.parquet.URL, Utf8("http://domcheloveplanet.ru/")) | +| | TableScan: hits.parquet projection=[WatchID, ClientIP, URL], partial_filters=[starts_with(hits.parquet.URL, Utf8("http://domcheloveplanet.ru/"))] | +| physical_plan | SortPreservingMergeExec: [wid@0 ASC NULLS LAST,ip@1 DESC], fetch=5 | +| | SortExec: TopK(fetch=5), expr=[wid@0 ASC NULLS LAST,ip@1 DESC], preserve_partitioning=[true] | +| | ProjectionExec: expr=[WatchID@0 as wid, ClientIP@1 as ip] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: starts_with(URL@2, http://domcheloveplanet.ru/) | +| | ParquetExec: file_groups={16 groups: [[hits.parquet:0..923748528], ...]}, projection=[WatchID, ClientIP, URL], predicate=starts_with(URL@13, http://domcheloveplanet.ru/) | ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +2 row(s) fetched. +Elapsed 0.060 seconds. +``` + +There are two sections: the logical plan and the physical plan. + +- **Logical Plan:** is a plan generated for a specific SQL query, DataFrame, or other language without + knowledge of the underlying data organization. +- **Physical Plan:** is a plan generated from a logical plan along with consideration of the hardware +configuration (e.g. number of CPUs) and the underlying data organization (e.g. number of files). + This physical plan is specific to your hardware configuration and your data. If you load the same +data to different hardware with different configurations, the same query may generate different query plans. + +Understanding a query plan can help you understand its performance. For example, when the plan shows your query reads +many files, it signals you to either add more filters to the query to read less data or to modify your file +design to make fewer but larger files. This document focuses on how to read a query plan. How to make a +query run faster depends on the reason it is slow and is beyond the scope of this document.
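The same plan can also be produced programmatically through the [DataFrame::explain] API mentioned above. A minimal sketch (it assumes `hits.parquet` is in the current working directory; the query is abbreviated for brevity):

```rust
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let ctx = SessionContext::new();
    let df = ctx
        .sql("SELECT \"WatchID\" AS wid, \"ClientIP\" AS ip FROM 'hits.parquet' LIMIT 5")
        .await?;
    // explain(verbose, analyze): (false, false) prints a two-row
    // logical/physical plan table like the one above, without running the query
    df.explain(false, false)?.show().await?;
    Ok(())
}
```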
## Query plans are trees + +A query plan is an upside down tree, and we always read from bottom up. The +physical plan above, in tree format, looks like this: + +``` + ▲ + │ + │ +┌─────────────────────────────────────────────────┐ +│ SortPreservingMergeExec │ +│ [wid@0 ASC NULLS LAST,ip@1 DESC] │ +│ fetch=5 │ +└─────────────────────────────────────────────────┘ + ▲ + │ +┌─────────────────────────────────────────────────┐ +│ SortExec TopK(fetch=5), │ +│ expr=[wid@0 ASC NULLS LAST,ip@1 DESC], │ +│ preserve_partitioning=[true] │ +└─────────────────────────────────────────────────┘ + ▲ + │ +┌─────────────────────────────────────────────────┐ +│ ProjectionExec │ +│ expr=[WatchID@0 as wid, ClientIP@1 as ip] │ +└─────────────────────────────────────────────────┘ + ▲ + │ +┌─────────────────────────────────────────────────┐ +│ CoalesceBatchesExec │ +└─────────────────────────────────────────────────┘ + ▲ + │ +┌─────────────────────────────────────────────────┐ +│ FilterExec │ +│ starts_with(URL@2, http://domcheloveplanet.ru/) │ +└─────────────────────────────────────────────────┘ + ▲ + │ +┌────────────────────────────────────────────────┐ +│ ParquetExec │ +│ hits.parquet (filter = ...) │ +└────────────────────────────────────────────────┘ +``` + +Each node in the tree/plan ends with `Exec` and is sometimes also called an `operator` or `ExecutionPlan`; this is where data is +processed, transformed, and sent up. + +1. First, data in the `hits.parquet` file is read in parallel using 16 cores in 16 "partitions" (more on this later) from `ParquetExec`, which applies a first pass at filtering during the scan. +2. Next, the output is filtered using `FilterExec` to ensure only rows where `starts_with(URL, 'http://domcheloveplanet.ru/')` evaluates to true are passed on. +3. The `CoalesceBatchesExec` then ensures that the data is grouped into larger batches for processing. +4. The `ProjectionExec` then projects the data to rename the `WatchID` and `ClientIP` columns to `wid` and `ip` respectively. +5. The `SortExec` then sorts the data by `wid ASC, ip DESC`. The `TopK(fetch=5)` indicates that a special implementation is used that only tracks and emits the top 5 values in each partition. +6. Finally, the `SortPreservingMergeExec` merges the sorted data from all partitions and returns the top 5 rows overall. + +## Understanding large query plans + +A large query plan may look intimidating, but you can quickly understand what it does by following these steps: + +1. As always, read from bottom up, one operator at a time. +2. Understand the job of this operator by reading + the [Physical Plan documentation](https://docs.rs/datafusion/latest/datafusion/physical_plan/index.html). +3. Understand the input data of the operator and how large/small it may be. +4. Understand how much data that operator produces and what it would look like. + +If you can answer those questions, you will be able to estimate how much work +that plan has to do and thus how long it will take. However, the `EXPLAIN` just +shows you the plan without executing it. + +If you want to know more about how much work each operator in the query plan does, +you can use `EXPLAIN ANALYZE` to get the plan with runtime metrics added (see +the next section). + +## More Debugging Information: `EXPLAIN VERBOSE` + +If the plan has to read too many files, not all of them will be shown in the +`EXPLAIN`. To see them, use `EXPLAIN VERBOSE`. Like `EXPLAIN`, `EXPLAIN VERBOSE` +does not run the query.
Instead, it shows the full explain plan, with information +that is omitted from the default explain, as well as all intermediate physical +plans DataFusion generates before returning. This mode can be very helpful for +debugging to see why and when DataFusion added and removed operators from a plan. + +## Execution Counters: `EXPLAIN ANALYZE` + +During execution, DataFusion operators collect detailed metrics. You can access +them programmatically via [`ExecutionPlan::metrics`] as well as with the +`EXPLAIN ANALYZE` command. For example, here is the same query as +above but with `EXPLAIN ANALYZE` (note the output is edited for clarity): + +[`executionplan::metrics`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#method.metrics + +``` +> EXPLAIN ANALYZE SELECT "WatchID" AS wid, "hits.parquet"."ClientIP" AS ip +FROM 'hits.parquet' +WHERE starts_with("URL", 'http://domcheloveplanet.ru/') +ORDER BY wid ASC, ip DESC +LIMIT 5; ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Plan with Metrics | SortPreservingMergeExec: [wid@0 ASC NULLS LAST,ip@1 DESC], fetch=5, metrics=[output_rows=5, elapsed_compute=2.375µs] | +| | SortExec: TopK(fetch=5), expr=[wid@0 ASC NULLS LAST,ip@1 DESC], preserve_partitioning=[true], metrics=[output_rows=75, elapsed_compute=7.243038ms, row_replacements=482] | +| | ProjectionExec: expr=[WatchID@0 as wid, ClientIP@1 as ip], metrics=[output_rows=811821, elapsed_compute=66.25µs] | +| | FilterExec: starts_with(URL@2, http://domcheloveplanet.ru/), metrics=[output_rows=811821, elapsed_compute=1.36923816s] | +| | ParquetExec: file_groups={16 groups: [[hits.parquet:0..923748528], ...]}, projection=[WatchID, ClientIP, URL], predicate=starts_with(URL@13, http://domcheloveplanet.ru/), metrics=[output_rows=99997497, elapsed_compute=16ns, ... bytes_scanned=3703192723, ... time_elapsed_opening=308.203002ms, time_elapsed_scanning_total=8.350342183s, ...] | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +1 row(s) fetched. +Elapsed 0.720 seconds. +``` + +In this case, DataFusion actually ran the query, but discarded any results, and +instead returned an annotated plan with a new field, `metrics=[...]`. + +Most operators have the common metrics `output_rows` and `elapsed_compute` and +some have operator-specific metrics such as `ParquetExec` which has +`bytes_scanned=3703192723`. Note that times and counters are reported across all +cores, so if you have 16 cores, the time reported is the sum of the time taken +by all 16 cores. + +Again, reading from bottom up: + +- `ParquetExec` + - `output_rows=99997497`: A total of 99.9M rows were produced + - `bytes_scanned=3703192723`: Of the 14GB file, 3.7GB were actually read (due to projection pushdown) + - `time_elapsed_opening=308.203002ms`: It took 300ms to open the file and prepare to read it + - `time_elapsed_scanning_total=8.350342183s`: It took 8.3 seconds of CPU time (across 16 cores) to actually decode the parquet data +- `FilterExec` + - `output_rows=811821`: Of the 99.9M rows at its input, only 811K rows passed the filter and were produced at the output + - `elapsed_compute=1.36923816s`: In total, 1.36s of CPU time (across 16 cores) was spent evaluating the filter +- `CoalesceBatchesExec` + - `output_rows=811821`, `elapsed_compute=12.873379ms`: Produced 811K rows in 13ms +- `ProjectionExec` + - `output_rows=811821, elapsed_compute=66.25µs`: Produced 811K rows in 66µs (microseconds). This projection is almost instantaneous as it does not manipulate any data +- `SortExec` + - `output_rows=75`: Produced 75 rows in total. Each of 16 cores could produce up to 5 rows, but in this case not all cores did. + - `elapsed_compute=7.243038ms`: 7ms was used to determine the top 5 rows + - `row_replacements=482`: Internally, the TopK operator updated its top list 482 times +- `SortPreservingMergeExec` + - `output_rows=5`, `elapsed_compute=2.375µs`: Produced the final 5 rows in 2.375µs (microseconds).
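For programmatic access to the same counters, the [`ExecutionPlan::metrics`] API referenced above can be read after a plan has run. A minimal sketch (it assumes `hits.parquet` is present; only the root operator is inspected, purely for illustration):

```rust
use std::sync::Arc;

use datafusion::physical_plan::collect;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let ctx = SessionContext::new();
    let df = ctx.sql("SELECT count(*) FROM 'hits.parquet'").await?;
    let plan = df.create_physical_plan().await?;

    // Metrics are only populated once the plan has actually executed
    let _batches = collect(Arc::clone(&plan), ctx.task_ctx()).await?;

    // Inspect the root operator; children can be visited via plan.children()
    if let Some(metrics) = plan.metrics() {
        println!("root output_rows: {:?}", metrics.output_rows());
    }
    Ok(())
}
```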
## Partitions and Execution + +DataFusion determines the optimal number of cores to use as part of query +planning. Roughly speaking, each "partition" in the plan is run independently using +a separate core. Data crosses between cores only within certain operators such as +`RepartitionExec`, `CoalescePartitions` and `SortPreservingMergeExec`. + +You can read more about this in the [Partitioning Docs]. + +[partitioning docs]: https://docs.rs/datafusion/latest/datafusion/physical_expr/enum.Partitioning.html
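The target partition count defaults to the number of CPU cores and can be overridden through the session configuration. A minimal sketch (the value `4` is an arbitrary illustration):

```rust
use datafusion::prelude::*;

fn main() {
    // Plan queries with at most 4 output partitions rather than one per core
    let config = SessionConfig::new().with_target_partitions(4);
    let _ctx = SessionContext::new_with_config(config);
}
```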
## Example of an Aggregate Query + +Let us delve into an example query that aggregates data from the `hits.parquet` +file. For example, this query from ClickBench finds the top 10 users by their +number of hits: + +```sql +SELECT "UserID", COUNT(*) +FROM 'hits.parquet' +GROUP BY "UserID" +ORDER BY COUNT(*) DESC +LIMIT 10; +``` + +We can again see the query plan by using `EXPLAIN`: + +``` +> EXPLAIN SELECT "UserID", COUNT(*) FROM 'hits.parquet' GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Limit: skip=0, fetch=10 | +| | Sort: count(*) DESC NULLS FIRST, fetch=10 | +| | Aggregate: groupBy=[[hits.parquet.UserID]], aggr=[[count(Int64(1)) AS count(*)]] | +| | TableScan: hits.parquet projection=[UserID] | +| physical_plan | GlobalLimitExec: skip=0, fetch=10 | +| | SortPreservingMergeExec: [count(*)@1 DESC], fetch=10 | +| | SortExec: TopK(fetch=10), expr=[count(*)@1 DESC], preserve_partitioning=[true] | +| | AggregateExec: mode=FinalPartitioned, gby=[UserID@0 as UserID], aggr=[count(*)] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([UserID@0], 10), input_partitions=10 | +| | AggregateExec: mode=Partial, gby=[UserID@0 as UserID], aggr=[count(*)] | +| | ParquetExec: file_groups={10 groups: [[hits.parquet:0..1477997645], [hits.parquet:1477997645..2955995290], [hits.parquet:2955995290..4433992935], [hits.parquet:4433992935..5911990580], [hits.parquet:5911990580..7389988225], ...]}, projection=[UserID] | +| | | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +For this query, let's again read the plan from the bottom to the top: + +**Logical plan operators** + +- `TableScan` + - `hits.parquet`: Scans data from the file `hits.parquet`. + - `projection=[UserID]`: Reads only the `UserID` column. +- `Aggregate` + - `groupBy=[[hits.parquet.UserID]]`: Groups by the `UserID` column. + - `aggr=[[count(Int64(1)) AS count(*)]]`: Applies the `COUNT` aggregate on each distinct group. +- `Sort` + - `count(*) DESC NULLS FIRST`: Sorts the data in descending count order.
+ - `fetch=10`: Returns only the first 10 rows. +- `Limit` + - `skip=0`: Does not skip any data for the results. + - `fetch=10`: Limits the results to 10 values. + +**Physical plan operators** + +- `ParquetExec` + - `file_groups={10 groups: [...]}`: Reads 10 groups in parallel from the `hits.parquet` file. (The example above was run on a machine with 10 cores.) + - `projection=[UserID]`: Pushes down projection of the `UserID` column. The parquet format is columnar and the DataFusion reader only decodes the columns required. +- `AggregateExec` + - `mode=Partial`: Runs a [partial aggregation] in parallel across each of the 10 partitions from the `ParquetExec` immediately after reading. + - `gby=[UserID@0 as UserID]`: Represents `GROUP BY` in the [physical plan] and groups together the same values of `UserID`. + - `aggr=[count(*)]`: Applies the `COUNT` aggregate on all rows for each group. +- `RepartitionExec` + - `partitioning=Hash([UserID@0], 10)`: Divides the input into 10 (new) output partitions based on the value of `hash(UserID)`. You can read more about this in the [partitioning] documentation. + - `input_partitions=10`: Number of input partitions. +- `CoalesceBatchesExec` + - `target_batch_size=8192`: Combines smaller batches into larger batches. In this case, approximately 8192 rows in each batch. +- `AggregateExec` + - `mode=FinalPartitioned`: Performs the final aggregation on each group. See the [documentation on multi phase grouping] for more information. + - `gby=[UserID@0 as UserID]`: Groups by `UserID`. + - `aggr=[count(*)]`: Applies the `COUNT` aggregate on all rows for each group. +- `SortExec` + - `TopK(fetch=10)`: Uses a special "TopK" sort that keeps only the largest 10 values in memory at a time. You can read more about this in the [TopK] documentation. + - `expr=[count(*)@1 DESC]`: Sorts all rows in descending order. Note this represents the `ORDER BY` in the physical plan. + - `preserve_partitioning=[true]`: The sort is done in parallel on each partition. In this case the top 10 values are found for each of the 10 partitions, in parallel. +- `SortPreservingMergeExec` + - `[count(*)@1 DESC]`: This operator merges the 10 distinct streams into a single stream using this expression. + - `fetch=10`: Returns only the first 10 rows. +- `GlobalLimitExec` + - `skip=0`: Does not skip any rows. + - `fetch=10`: Returns only the first 10 rows, denoted by `LIMIT 10` in the query. + +[partial aggregation]: https://docs.rs/datafusion/latest/datafusion/physical_plan/aggregates/enum.AggregateMode.html#variant.Partial +[physical plan]: https://docs.rs/datafusion/latest/datafusion/physical_plan/aggregates/struct.PhysicalGroupBy.html +[partitioning]: https://docs.rs/datafusion/latest/datafusion/physical_plan/repartition/struct.RepartitionExec.html +[topk]: https://docs.rs/datafusion/latest/datafusion/physical_plan/struct.TopK.html +[documentation on multi phase grouping]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.Accumulator.html#tymethod.state + +### Data in this Example + +The examples in this section use data from [ClickBench], a benchmark for data +analytics.
The examples use the 14GB [`hits.parquet`] file, which can be +downloaded from the ClickBench website or with the following commands: + +```shell +cd benchmarks +./bench.sh data clickbench_1 +*************************** +DataFusion Benchmark Runner and Data Generator +COMMAND: data +BENCHMARK: clickbench_1 +DATA_DIR: /Users/andrewlamb/Software/datafusion2/benchmarks/data +CARGO_COMMAND: cargo run --release +PREFER_HASH_JOIN: true +*************************** +Checking hits.parquet...... found 14779976446 bytes ... Done +``` + +Then you can run `datafusion-cli` to get plans: + +```shell +cd datafusion/benchmarks/data +datafusion-cli + +DataFusion CLI v41.0.0 +> select count(*) from 'hits.parquet'; ++----------+ +| count(*) | ++----------+ +| 99997497 | ++----------+ +1 row(s) fetched. +Elapsed 0.062 seconds. +> +``` + +[clickbench]: https://benchmark.clickhouse.com/ +[`hits.parquet`]: https://datasets.clickhouse.com/hits_compatible/hits.parquet diff --git a/docs/source/user-guide/sql/explain.md b/docs/source/user-guide/sql/explain.md index 22f73e3d76d7..45bb3a57aa7c 100644 --- a/docs/source/user-guide/sql/explain.md +++ b/docs/source/user-guide/sql/explain.md @@ -21,6 +21,8 @@ The `EXPLAIN` command shows the logical and physical execution plan for the specified SQL statement. +See the [Reading Explain Plans](../explain-usage.md) page for more information on how to interpret these plans. +
 EXPLAIN [ANALYZE] [VERBOSE] statement
 
From 5163e1559f9d84d04bdd967adf34ad07e63b249d Mon Sep 17 00:00:00 2001 From: WeblWabl Date: Wed, 28 Aug 2024 07:29:16 -0500 Subject: [PATCH 23/23] Make RuntimeEnvBuilder rather than RuntimeConfig (#12157) * feat/12156: Make RuntimeEnvBuilder rather than RuntimeConfig Signed-off-by: Devan * feat/12156: Make RuntimeEnvBuilder rather than RuntimeConfig Signed-off-by: Devan * doc link Signed-off-by: Devan * update to use builder for rt env Signed-off-by: Devan * update to use builder Signed-off-by: Devan * clippy Signed-off-by: Devan * touch Signed-off-by: Devan * fmt Signed-off-by: Devan * revert some formatting that occurred Signed-off-by: Devan * revert some formatting that occurred Signed-off-by: Devan * use builder Signed-off-by: Devan * fmt Signed-off-by: Devan * Update datafusion/execution/src/runtime_env.rs Co-authored-by: Andrew Lamb --------- Signed-off-by: Devan Co-authored-by: Devan Co-authored-by: Andrew Lamb --- .../core/src/datasource/file_format/csv.rs | 4 +- datafusion/core/src/execution/context/mod.rs | 13 +++--- datafusion/core/tests/fuzz_cases/sort_fuzz.rs | 12 +++--- datafusion/core/tests/memory_limit/mod.rs | 12 +++--- .../core/tests/parquet/file_statistics.rs | 7 ++- datafusion/execution/src/runtime_env.rs | 27 +++++++++--- datafusion/execution/src/task.rs | 5 ++- .../physical-plan/src/aggregates/mod.rs | 15 ++++--- .../physical-plan/src/joins/cross_join.rs | 9 ++-- .../physical-plan/src/joins/hash_join.rs | 16 ++++--- .../src/joins/nested_loop_join.rs | 9 ++-- .../src/joins/sort_merge_join.rs | 43 +++++++++++-------- .../physical-plan/src/repartition/mod.rs | 6 ++- datafusion/physical-plan/src/sorts/sort.rs | 21 +++++---- datafusion/wasmtest/src/lib.rs | 13 +++--- 15 files changed, 129 insertions(+), 83 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index d1ce2afcccf3..e43f6ab29abc 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -680,7 +680,7 @@ mod tests { use datafusion_common::cast::as_string_array; use datafusion_common::internal_err; use datafusion_common::stats::Precision; - use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; + use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_expr::{col, lit}; use crate::execution::session_state::SessionStateBuilder; @@ -863,7 +863,7 @@ mod tests { async fn query_compress_data( file_compression_type: FileCompressionType, ) -> Result<()> { - let runtime = Arc::new(RuntimeEnv::new(RuntimeConfig::new()).unwrap()); + let runtime = Arc::new(RuntimeEnvBuilder::new().build()?); let mut cfg = SessionConfig::new(); cfg.options_mut().catalog.has_header = true; let session_state = SessionStateBuilder::new() diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 18943599b136..c67424c0fa53 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -212,15 +212,15 @@ where /// # use std::sync::Arc; /// # use datafusion::prelude::*; /// # use datafusion::execution::SessionStateBuilder; -/// # use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +/// # use datafusion_execution::runtime_env::RuntimeEnvBuilder; /// // Configure a 4k batch size /// let config = SessionConfig::new() .with_batch_size(4 * 1024); /// /// // configure a memory limit of 1GB with 20% slop -/// let runtime_env = RuntimeEnv::new( -/// RuntimeConfig::new() +/// 
let runtime_env = RuntimeEnvBuilder::new() /// .with_memory_limit(1024 * 1024 * 1024, 0.80) -/// ).unwrap(); +/// .build() +/// .unwrap(); /// /// // Create a SessionState using the config and runtime_env /// let state = SessionStateBuilder::new() @@ -1623,7 +1623,7 @@ mod tests { use super::{super::options::CsvReadOptions, *}; use crate::assert_batches_eq; use crate::execution::memory_pool::MemoryConsumer; - use crate::execution::runtime_env::RuntimeConfig; + use crate::execution::runtime_env::RuntimeEnvBuilder; use crate::test; use crate::test_util::{plan_and_collect, populate_csv_partitions}; @@ -1758,8 +1758,7 @@ mod tests { let path = path.join("tests/tpch-csv"); let url = format!("file://{}", path.display()); - let rt_cfg = RuntimeConfig::new(); - let runtime = Arc::new(RuntimeEnv::new(rt_cfg).unwrap()); + let runtime = Arc::new(RuntimeEnvBuilder::new().build()?); let cfg = SessionConfig::new() .set_str("datafusion.catalog.location", url.as_str()) .set_str("datafusion.catalog.format", "CSV") diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs index eda306dd3de5..1980589491a5 100644 --- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs @@ -22,7 +22,7 @@ use arrow::{ compute::SortOptions, record_batch::RecordBatch, }; -use datafusion::execution::runtime_env::RuntimeConfig; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::physical_plan::expressions::PhysicalSortExpr; use datafusion::physical_plan::memory::MemoryExec; use datafusion::physical_plan::sorts::sort::SortExec; @@ -136,10 +136,12 @@ impl SortTest { .sort_spill_reservation_bytes, ); - let runtime_env = RuntimeConfig::new() - .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size))) - .build(); - let runtime = Arc::new(runtime_env.unwrap()); + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size))) + .build() + .unwrap(), + ); SessionContext::new_with_config_rt(session_config, runtime) } else { SessionContext::new_with_config(session_config) diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index dbd5592e8020..592c25dedc50 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -40,7 +40,7 @@ use tokio::fs::File; use datafusion::datasource::streaming::StreamingTable; use datafusion::datasource::{MemTable, TableProvider}; use datafusion::execution::disk_manager::DiskManagerConfig; -use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::execution::session_state::SessionStateBuilder; use datafusion::physical_optimizer::join_selection::JoinSelection; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; @@ -509,17 +509,17 @@ impl TestCase { let table = scenario.table(); - let mut rt_config = RuntimeConfig::new() + let rt_config = RuntimeEnvBuilder::new() // disk manager setting controls the spilling .with_disk_manager(disk_manager_config) .with_memory_limit(memory_limit, MEMORY_FRACTION); - if let Some(pool) = memory_pool { - rt_config = rt_config.with_memory_pool(pool); + let runtime = if let Some(pool) = memory_pool { + rt_config.with_memory_pool(pool).build().unwrap() + } else { + rt_config.build().unwrap() }; - let runtime = RuntimeEnv::new(rt_config).unwrap(); - // Configure execution let builder = SessionStateBuilder::new() 
.with_config(config) diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index bf25b36f48e8..bd251f1a6669 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -33,7 +33,7 @@ use datafusion_execution::cache::cache_unit::{ DefaultFileStatisticsCache, DefaultListFilesCache, }; use datafusion_execution::config::SessionConfig; -use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion::execution::session_state::SessionStateBuilder; use tempfile::tempdir; @@ -198,7 +198,10 @@ fn get_cache_runtime_state() -> ( .with_list_files_cache(Some(list_file_cache.clone())); let rt = Arc::new( - RuntimeEnv::new(RuntimeConfig::new().with_cache_manager(cache_config)).unwrap(), + RuntimeEnvBuilder::new() + .with_cache_manager(cache_config) + .build() + .expect("could not build runtime environment"), ); let state = SessionContext::new_with_config_rt(SessionConfig::default(), rt).state(); diff --git a/datafusion/execution/src/runtime_env.rs b/datafusion/execution/src/runtime_env.rs index bddce81f537b..e7b48be95cff 100644 --- a/datafusion/execution/src/runtime_env.rs +++ b/datafusion/execution/src/runtime_env.rs @@ -41,7 +41,7 @@ use url::Url; /// Execution runtime environment that manages system resources such /// as memory, disk, cache and storage. /// -/// A [`RuntimeEnv`] is created from a [`RuntimeConfig`] and has the +/// A [`RuntimeEnv`] is created from a [`RuntimeEnvBuilder`] and has the /// following resource management functionality: /// /// * [`MemoryPool`]: Manage memory @@ -147,13 +147,17 @@ impl RuntimeEnv { impl Default for RuntimeEnv { fn default() -> Self { - RuntimeEnv::new(RuntimeConfig::new()).unwrap() + RuntimeEnvBuilder::new().build().unwrap() } } +/// Please see: +/// This is a type alias for backwards compatibility.
+pub type RuntimeConfig = RuntimeEnvBuilder; + #[derive(Clone)] /// Execution runtime configuration -pub struct RuntimeConfig { +pub struct RuntimeEnvBuilder { /// DiskManager to manage temporary disk file usage pub disk_manager: DiskManagerConfig, /// [`MemoryPool`] from which to allocate memory @@ -166,13 +170,13 @@ pub struct RuntimeConfig { pub object_store_registry: Arc, } -impl Default for RuntimeConfig { +impl Default for RuntimeEnvBuilder { fn default() -> Self { Self::new() } } -impl RuntimeConfig { +impl RuntimeEnvBuilder { /// New with default values pub fn new() -> Self { Self { @@ -229,8 +233,17 @@ impl RuntimeConfig { self.with_disk_manager(DiskManagerConfig::new_specified(vec![path.into()])) } - /// Build a `RuntimeEnv` object from the configuration + /// Build a RuntimeEnv pub fn build(self) -> Result { - RuntimeEnv::new(self) + let memory_pool = self + .memory_pool + .unwrap_or_else(|| Arc::new(UnboundedMemoryPool::default())); + + Ok(RuntimeEnv { + memory_pool, + disk_manager: DiskManager::try_new(self.disk_manager)?, + cache_manager: CacheManager::try_new(&self.cache_manager)?, + object_store_registry: self.object_store_registry, + }) } } diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs index 21a644284c42..35689b8e08df 100644 --- a/datafusion/execution/src/task.rs +++ b/datafusion/execution/src/task.rs @@ -24,7 +24,7 @@ use crate::{ config::SessionConfig, memory_pool::MemoryPool, registry::FunctionRegistry, - runtime_env::{RuntimeConfig, RuntimeEnv}, + runtime_env::{RuntimeEnv, RuntimeEnvBuilder}, }; use datafusion_common::{plan_datafusion_err, DataFusionError, Result}; use datafusion_expr::planner::ExprPlanner; @@ -57,7 +57,8 @@ pub struct TaskContext { impl Default for TaskContext { fn default() -> Self { - let runtime = RuntimeEnv::new(RuntimeConfig::new()) + let runtime = RuntimeEnvBuilder::new() + .build() .expect("default runtime created successfully"); // Create a default task context, mostly useful for testing diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 6ca01928c80f..b5447a2d7317 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1212,7 +1212,7 @@ mod tests { }; use datafusion_execution::config::SessionConfig; use datafusion_execution::memory_pool::FairSpillPool; - use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; + use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_functions_aggregate::array_agg::array_agg_udaf; use datafusion_functions_aggregate::average::avg_udaf; use datafusion_functions_aggregate::count::count_udaf; @@ -1324,11 +1324,10 @@ mod tests { fn new_spill_ctx(batch_size: usize, max_memory: usize) -> Arc { let session_config = SessionConfig::new().with_batch_size(batch_size); let runtime = Arc::new( - RuntimeEnv::new( - RuntimeConfig::default() - .with_memory_pool(Arc::new(FairSpillPool::new(max_memory))), - ) - .unwrap(), + RuntimeEnvBuilder::default() + .with_memory_pool(Arc::new(FairSpillPool::new(max_memory))) + .build() + .unwrap(), ); let task_ctx = TaskContext::default() .with_session_config(session_config) @@ -1809,7 +1808,9 @@ mod tests { let input_schema = input.schema(); let runtime = Arc::new( - RuntimeEnv::new(RuntimeConfig::default().with_memory_limit(1, 1.0)).unwrap(), + RuntimeEnvBuilder::default() + .with_memory_limit(1, 1.0) + .build()?, ); let task_ctx = TaskContext::default().with_runtime(runtime); let task_ctx = 
Arc::new(task_ctx); diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 0868ee721665..b99d4f17c42a 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -488,7 +488,7 @@ mod tests { use crate::test::build_table_scan_i32; use datafusion_common::{assert_batches_sorted_eq, assert_contains}; - use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; + use datafusion_execution::runtime_env::RuntimeEnvBuilder; async fn join_collect( left: Arc, @@ -673,8 +673,11 @@ mod tests { #[tokio::test] async fn test_overallocation() -> Result<()> { - let runtime_config = RuntimeConfig::new().with_memory_limit(100, 1.0); - let runtime = Arc::new(RuntimeEnv::new(runtime_config)?); + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit(100, 1.0) + .build()?, + ); let task_ctx = TaskContext::default().with_runtime(runtime); let task_ctx = Arc::new(task_ctx); diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 7fac23ad5557..f20d00e1a298 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -1572,7 +1572,7 @@ mod tests { ScalarValue, }; use datafusion_execution::config::SessionConfig; - use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; + use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_expr::Operator; use datafusion_physical_expr::expressions::{BinaryExpr, Literal}; @@ -3798,8 +3798,11 @@ mod tests { ]; for join_type in join_types { - let runtime_config = RuntimeConfig::new().with_memory_limit(100, 1.0); - let runtime = Arc::new(RuntimeEnv::new(runtime_config)?); + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit(100, 1.0) + .build()?, + ); let task_ctx = TaskContext::default().with_runtime(runtime); let task_ctx = Arc::new(task_ctx); @@ -3871,8 +3874,11 @@ mod tests { ]; for join_type in join_types { - let runtime_config = RuntimeConfig::new().with_memory_limit(100, 1.0); - let runtime = Arc::new(RuntimeEnv::new(runtime_config)?); + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit(100, 1.0) + .build()?, + ); let session_config = SessionConfig::default().with_batch_size(50); let task_ctx = TaskContext::default() .with_session_config(session_config) diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index 18de2de03192..3cd373544157 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -644,7 +644,7 @@ mod tests { use arrow::datatypes::{DataType, Field}; use datafusion_common::{assert_batches_sorted_eq, assert_contains, ScalarValue}; - use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; + use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_expr::Operator; use datafusion_physical_expr::expressions::{BinaryExpr, Literal}; use datafusion_physical_expr::{Partitioning, PhysicalExpr}; @@ -1019,8 +1019,11 @@ mod tests { ]; for join_type in join_types { - let runtime_config = RuntimeConfig::new().with_memory_limit(100, 1.0); - let runtime = Arc::new(RuntimeEnv::new(runtime_config)?); + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit(100, 1.0) + .build()?, + ); let task_ctx = TaskContext::default().with_runtime(runtime); let task_ctx = 
Arc::new(task_ctx); diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index 511cb4c55fcd..09fe5d9ebc54 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -1978,7 +1978,7 @@ mod tests { }; use datafusion_execution::config::SessionConfig; use datafusion_execution::disk_manager::DiskManagerConfig; - use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; + use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_execution::TaskContext; use crate::expressions::Column; @@ -2900,10 +2900,12 @@ mod tests { ]; // Disable DiskManager to prevent spilling - let runtime_config = RuntimeConfig::new() - .with_memory_limit(100, 1.0) - .with_disk_manager(DiskManagerConfig::Disabled); - let runtime = Arc::new(RuntimeEnv::new(runtime_config)?); + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit(100, 1.0) + .with_disk_manager(DiskManagerConfig::Disabled) + .build()?, + ); let session_config = SessionConfig::default().with_batch_size(50); for join_type in join_types { @@ -2985,10 +2987,12 @@ mod tests { ]; // Disable DiskManager to prevent spilling - let runtime_config = RuntimeConfig::new() - .with_memory_limit(100, 1.0) - .with_disk_manager(DiskManagerConfig::Disabled); - let runtime = Arc::new(RuntimeEnv::new(runtime_config)?); + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit(100, 1.0) + .with_disk_manager(DiskManagerConfig::Disabled) + .build()?, + ); let session_config = SessionConfig::default().with_batch_size(50); for join_type in join_types { @@ -3048,10 +3052,12 @@ mod tests { ]; // Enable DiskManager to allow spilling - let runtime_config = RuntimeConfig::new() - .with_memory_limit(100, 1.0) - .with_disk_manager(DiskManagerConfig::NewOs); - let runtime = Arc::new(RuntimeEnv::new(runtime_config)?); + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit(100, 1.0) + .with_disk_manager(DiskManagerConfig::NewOs) + .build()?, + ); for batch_size in [1, 50] { let session_config = SessionConfig::default().with_batch_size(batch_size); @@ -3156,10 +3162,13 @@ mod tests { ]; // Enable DiskManager to allow spilling - let runtime_config = RuntimeConfig::new() - .with_memory_limit(500, 1.0) - .with_disk_manager(DiskManagerConfig::NewOs); - let runtime = Arc::new(RuntimeEnv::new(runtime_config)?); + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit(500, 1.0) + .with_disk_manager(DiskManagerConfig::NewOs) + .build()?, + ); + for batch_size in [1, 50] { let session_config = SessionConfig::default().with_batch_size(batch_size); diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 5a3fcb5029e1..650006a9d02d 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1025,7 +1025,7 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::cast::as_string_array; use datafusion_common::{assert_batches_sorted_eq, exec_err}; - use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; + use datafusion_execution::runtime_env::RuntimeEnvBuilder; use tokio::task::JoinSet; @@ -1507,7 +1507,9 @@ mod tests { // setup up context let runtime = Arc::new( - RuntimeEnv::new(RuntimeConfig::default().with_memory_limit(1, 1.0)).unwrap(), + RuntimeEnvBuilder::default() + .with_memory_limit(1, 1.0) + .build()?, ); 
let task_ctx = TaskContext::default().with_runtime(runtime); diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index a81b09948cca..e92a57493141 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -966,7 +966,7 @@ mod tests { use arrow::datatypes::*; use datafusion_common::cast::as_primitive_array; use datafusion_execution::config::SessionConfig; - use datafusion_execution::runtime_env::RuntimeConfig; + use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_common::ScalarValue; use datafusion_physical_expr::expressions::Literal; @@ -1009,9 +1009,11 @@ mod tests { .options() .execution .sort_spill_reservation_bytes; - let rt_config = RuntimeConfig::new() - .with_memory_limit(sort_spill_reservation_bytes + 12288, 1.0); - let runtime = Arc::new(RuntimeEnv::new(rt_config)?); + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit(sort_spill_reservation_bytes + 12288, 1.0) + .build()?, + ); let task_ctx = Arc::new( TaskContext::default() .with_session_config(session_config) @@ -1085,11 +1087,14 @@ mod tests { .execution .sort_spill_reservation_bytes; - let rt_config = RuntimeConfig::new().with_memory_limit( - sort_spill_reservation_bytes + avg_batch_size * (partitions - 1), - 1.0, + let runtime = Arc::new( + RuntimeEnvBuilder::new() + .with_memory_limit( + sort_spill_reservation_bytes + avg_batch_size * (partitions - 1), + 1.0, + ) + .build()?, ); - let runtime = Arc::new(RuntimeEnv::new(rt_config)?); let task_ctx = Arc::new( TaskContext::default() .with_runtime(runtime) diff --git a/datafusion/wasmtest/src/lib.rs b/datafusion/wasmtest/src/lib.rs index a74cce72ac64..50325d262d1d 100644 --- a/datafusion/wasmtest/src/lib.rs +++ b/datafusion/wasmtest/src/lib.rs @@ -78,9 +78,8 @@ mod test { use super::*; use datafusion::execution::context::SessionContext; use datafusion_execution::{ - config::SessionConfig, - disk_manager::DiskManagerConfig, - runtime_env::{RuntimeConfig, RuntimeEnv}, + config::SessionConfig, disk_manager::DiskManagerConfig, + runtime_env::RuntimeEnvBuilder, }; use datafusion_physical_plan::collect; use datafusion_sql::parser::DFParser; @@ -100,10 +99,10 @@ mod test { // Execute SQL (using datafusion) let rt = Arc::new( - RuntimeEnv::new( - RuntimeConfig::new().with_disk_manager(DiskManagerConfig::Disabled), - ) - .unwrap(), + RuntimeEnvBuilder::new() + .with_disk_manager(DiskManagerConfig::Disabled) + .build() + .unwrap(), ); let session_config = SessionConfig::new().with_target_partitions(1); let session_context =