apache · jayzhan211 · Jun 16, 2024 · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024
diff --git a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs
@@ -390,6 +390,7 @@ pub(crate) mod tests {
                 &[self.column()],
                 &[],
                 &[],
+                &[],
                 schema,
                 self.column_name(),
                 false,

diff --git a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs
@@ -315,6 +315,7 @@ mod tests {
             &[expr],
             &[],
             &[],
+            &[],
             schema,
             name,
             false,
@@ -404,6 +405,7 @@ mod tests {
             &[col("b", &schema)?],
             &[],
             &[],
+            &[],
             &schema,
             "Sum(b)",
             false,

diff --git a/datafusion/core/src/physical_optimizer/test_utils.rs b/datafusion/core/src/physical_optimizer/test_utils.rs
@@ -245,6 +245,7 @@ pub fn bounded_window_exec(
                 "count".to_owned(),
                 &[col(col_name, &schema).unwrap()],
                 &[],
+                &[],
                 &sort_exprs,
                 Arc::new(WindowFrame::new(Some(false))),
                 schema.as_ref(),

diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
@@ -1766,7 +1766,8 @@ pub fn create_window_expr_with_name(
             window_frame,
             null_treatment,
         }) => {
-            let args = create_physical_exprs(args, logical_schema, execution_props)?;
+            let physical_args =
+                create_physical_exprs(args, logical_schema, execution_props)?;
             let partition_by =
                 create_physical_exprs(partition_by, logical_schema, execution_props)?;
             let order_by =
@@ -1780,13 +1781,13 @@ pub fn create_window_expr_with_name(
             }
 
             let window_frame = Arc::new(window_frame.clone());
-            let ignore_nulls = null_treatment
-                .unwrap_or(sqlparser::ast::NullTreatment::RespectNulls)
+            let ignore_nulls = null_treatment.unwrap_or(NullTreatment::RespectNulls)
                 == NullTreatment::IgnoreNulls;
             windows::create_window_expr(
                 fun,
                 name,
-                &args,
+                &physical_args,
+                args,
                 &partition_by,
                 &order_by,
                 window_frame,
@@ -1837,7 +1838,7 @@ pub fn create_aggregate_expr_with_name_and_maybe_filter(
             order_by,
             null_treatment,
         }) => {
-            let args =
+            let physical_args =
                 create_physical_exprs(args, logical_input_schema, execution_props)?;
             let filter = match filter {
                 Some(e) => Some(create_physical_expr(
@@ -1867,7 +1868,7 @@ pub fn create_aggregate_expr_with_name_and_maybe_filter(
                     let agg_expr = aggregates::create_aggregate_expr(
                         fun,
                         *distinct,
-                        &args,
+                        &physical_args,
                         &ordering_reqs,
                         physical_input_schema,
                         name,
@@ -1889,7 +1890,8 @@ pub fn create_aggregate_expr_with_name_and_maybe_filter(
                         physical_sort_exprs.clone().unwrap_or(vec![]);
                     let agg_expr = udaf::create_aggregate_expr(
                         fun,
-                        &args,
+                        &physical_args,
+                        args,
                         &sort_exprs,
                         &ordering_reqs,
                         physical_input_schema,

diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs
@@ -33,7 +33,7 @@ use datafusion::assert_batches_eq;
 use datafusion_common::{DFSchema, ScalarValue};
 use datafusion_expr::expr::Alias;
 use datafusion_expr::ExprSchemable;
-use datafusion_functions_aggregate::expr_fn::approx_median;
+use datafusion_functions_aggregate::expr_fn::{approx_median, approx_percentile_cont};
 
 fn test_schema() -> SchemaRef {
     Arc::new(Schema::new(vec![

diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
@@ -108,6 +108,7 @@ async fn run_aggregate_test(input1: Vec<RecordBatch>, group_by_columns: Vec<&str
         &[col("d", &schema).unwrap()],
         &[],
         &[],
+        &[],
         &schema,
         "sum1",
         false,

diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs
@@ -252,6 +252,7 @@ async fn bounded_window_causal_non_causal() -> Result<()> {
 
     let partitionby_exprs = vec![];
     let orderby_exprs = vec![];
+    let logical_exprs = vec![];
     // Window frame starts with "UNBOUNDED PRECEDING":
     let start_bound = WindowFrameBound::Preceding(ScalarValue::UInt64(None));
 
@@ -283,6 +284,7 @@ async fn bounded_window_causal_non_causal() -> Result<()> {
                     &window_fn,
                     fn_name.to_string(),
                     &args,
+                    &logical_exprs,
                     &partitionby_exprs,
                     &orderby_exprs,
                     Arc::new(window_frame),
@@ -699,6 +701,7 @@ async fn run_window_test(
             &window_fn,
             fn_name.clone(),
             &args,
+            &[],
             &partitionby_exprs,
             &orderby_exprs,
             Arc::new(window_frame.clone()),
@@ -717,6 +720,7 @@ async fn run_window_test(
             &window_fn,
             fn_name,
             &args,
+            &[],
             &partitionby_exprs,
             &orderby_exprs,
             Arc::new(window_frame.clone()),

diff --git a/datafusion/expr/src/aggregate_function.rs b/datafusion/expr/src/aggregate_function.rs
@@ -21,7 +21,7 @@ use std::sync::Arc;
 use std::{fmt, str::FromStr};
 
 use crate::utils;
-use crate::{type_coercion::aggregates::*, Signature, TypeSignature, Volatility};
+use crate::{type_coercion::aggregates::*, Signature, Volatility};
 
 use arrow::datatypes::{DataType, Field};
 use datafusion_common::{plan_datafusion_err, plan_err, DataFusionError, Result};
@@ -45,10 +45,6 @@ pub enum AggregateFunction {
     NthValue,
     /// Correlation
     Correlation,
-    /// Approximate continuous percentile function
-    ApproxPercentileCont,
-    /// Approximate continuous percentile function with weight
-    ApproxPercentileContWithWeight,
     /// Grouping
     Grouping,
     /// Bit And
@@ -75,8 +71,6 @@ impl AggregateFunction {
             ArrayAgg => "ARRAY_AGG",
             NthValue => "NTH_VALUE",
             Correlation => "CORR",
-            ApproxPercentileCont => "APPROX_PERCENTILE_CONT",
-            ApproxPercentileContWithWeight => "APPROX_PERCENTILE_CONT_WITH_WEIGHT",
             Grouping => "GROUPING",
             BitAnd => "BIT_AND",
             BitOr => "BIT_OR",
@@ -113,11 +107,6 @@ impl FromStr for AggregateFunction {
             "string_agg" => AggregateFunction::StringAgg,
             // statistical
             "corr" => AggregateFunction::Correlation,
-            // approximate
-            "approx_percentile_cont" => AggregateFunction::ApproxPercentileCont,
-            "approx_percentile_cont_with_weight" => {
-                AggregateFunction::ApproxPercentileContWithWeight
-            }
             // other
             "grouping" => AggregateFunction::Grouping,
             _ => {
@@ -170,10 +159,6 @@ impl AggregateFunction {
                 coerced_data_types[0].clone(),
                 true,
             )))),
-            AggregateFunction::ApproxPercentileCont => Ok(coerced_data_types[0].clone()),
-            AggregateFunction::ApproxPercentileContWithWeight => {
-                Ok(coerced_data_types[0].clone())
-            }
             AggregateFunction::Grouping => Ok(DataType::Int32),
             AggregateFunction::NthValue => Ok(coerced_data_types[0].clone()),
             AggregateFunction::StringAgg => Ok(DataType::LargeUtf8),
@@ -230,39 +215,6 @@ impl AggregateFunction {
             AggregateFunction::Correlation => {
                 Signature::uniform(2, NUMERICS.to_vec(), Volatility::Immutable)
             }
-            AggregateFunction::ApproxPercentileCont => {
-                let mut variants =
-                    Vec::with_capacity(NUMERICS.len() * (INTEGERS.len() + 1));
-                // Accept any numeric value paired with a float64 percentile
-                for num in NUMERICS {
-                    variants
-                        .push(TypeSignature::Exact(vec![num.clone(), DataType::Float64]));
-                    // Additionally accept an integer number of centroids for T-Digest
-                    for int in INTEGERS {
-                        variants.push(TypeSignature::Exact(vec![
-                            num.clone(),
-                            DataType::Float64,
-                            int.clone(),
-                        ]))
-                    }
-                }
-
-                Signature::one_of(variants, Volatility::Immutable)
-            }
-            AggregateFunction::ApproxPercentileContWithWeight => Signature::one_of(
-                // Accept any numeric value paired with a float64 percentile
-                NUMERICS
-                    .iter()
-                    .map(|t| {
-                        TypeSignature::Exact(vec![
-                            t.clone(),
-                            t.clone(),
-                            DataType::Float64,
-                        ])
-                    })
-                    .collect(),
-                Volatility::Immutable,
-            ),
             AggregateFunction::StringAgg => {
                 Signature::uniform(2, STRINGS.to_vec(), Volatility::Immutable)
             }

diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
@@ -242,34 +242,6 @@ pub fn in_list(expr: Expr, list: Vec<Expr>, negated: bool) -> Expr {
     Expr::InList(InList::new(Box::new(expr), list, negated))
 }
 
-/// Calculate an approximation of the specified `percentile` for `expr`.
-pub fn approx_percentile_cont(expr: Expr, percentile: Expr) -> Expr {
-    Expr::AggregateFunction(AggregateFunction::new(
-        aggregate_function::AggregateFunction::ApproxPercentileCont,
-        vec![expr, percentile],
-        false,
-        None,
-        None,
-        None,
-    ))
-}
-
-/// Calculate an approximation of the specified `percentile` for `expr` and `weight_expr`.
-pub fn approx_percentile_cont_with_weight(
-    expr: Expr,
-    weight_expr: Expr,
-    percentile: Expr,
-) -> Expr {
-    Expr::AggregateFunction(AggregateFunction::new(
-        aggregate_function::AggregateFunction::ApproxPercentileContWithWeight,
-        vec![expr, weight_expr, percentile],
-        false,
-        None,
-        None,
-        None,
-    ))
-}
-
 /// Create an EXISTS subquery expression
 pub fn exists(subquery: Arc<LogicalPlan>) -> Expr {
     let outer_ref_columns = subquery.all_out_ref_exprs();

diff --git a/datafusion/expr/src/function.rs b/datafusion/expr/src/function.rs
@@ -83,8 +83,8 @@ pub struct AccumulatorArgs<'a> {
     /// The input type of the aggregate function.
     pub input_type: &'a DataType,
 
-    /// The number of arguments the aggregate function takes.
-    pub args_num: usize,
+    /// The logical expression of arguments the aggregate function takes.
+    pub args: &'a [Expr],
 }
 
 /// [`StateFieldsArgs`] contains information about the fields that an

diff --git a/datafusion/expr/src/type_coercion/aggregates.rs b/datafusion/expr/src/type_coercion/aggregates.rs
@@ -17,7 +17,6 @@
 
 use std::ops::Deref;
 
-use super::functions::can_coerce_from;
 use crate::{AggregateFunction, Signature, TypeSignature};
 
 use arrow::datatypes::{
@@ -158,55 +157,6 @@ pub fn coerce_types(
             }
             Ok(vec![Float64, Float64])
         }
-        AggregateFunction::ApproxPercentileCont => {
-            if !is_approx_percentile_cont_supported_arg_type(&input_types[0]) {
-                return plan_err!(
-                    "The function {:?} does not support inputs of type {:?}.",
-                    agg_fun,
-                    input_types[0]
-                );
-            }
-            if input_types.len() == 3 && !input_types[2].is_integer() {
-                return plan_err!(
-                        "The percentile sample points count for {:?} must be integer, not {:?}.",
-                        agg_fun, input_types[2]
-                    );
-            }
-            let mut result = input_types.to_vec();
-            if can_coerce_from(&Float64, &input_types[1]) {
-                result[1] = Float64;
-            } else {
-                return plan_err!(
-                    "Could not coerce the percent argument for {:?} to Float64. Was  {:?}.",
-                    agg_fun, input_types[1]
-                );
-            }
-            Ok(result)
-        }
-        AggregateFunction::ApproxPercentileContWithWeight => {
-            if !is_approx_percentile_cont_supported_arg_type(&input_types[0]) {
-                return plan_err!(
-                    "The function {:?} does not support inputs of type {:?}.",
-                    agg_fun,
-                    input_types[0]
-                );
-            }
-            if !is_approx_percentile_cont_supported_arg_type(&input_types[1]) {
-                return plan_err!(
-                    "The weight argument for {:?} does not support inputs of type {:?}.",
-                    agg_fun,
-                    input_types[1]
-                );
-            }
-            if !matches!(input_types[2], Float64) {
-                return plan_err!(
-                    "The percentile argument for {:?} must be Float64, not {:?}.",
-                    agg_fun,
-                    input_types[2]
-                );
-            }
-            Ok(input_types.to_vec())
-        }
         AggregateFunction::NthValue => Ok(input_types.to_vec()),
         AggregateFunction::Grouping => Ok(vec![input_types[0].clone()]),
         AggregateFunction::StringAgg => {
@@ -532,29 +482,6 @@ mod tests {
         assert_eq!(r[0], DataType::Decimal128(20, 3));
         let r = coerce_types(&fun, &[DataType::Decimal256(20, 3)], &signature).unwrap();
         assert_eq!(r[0], DataType::Decimal256(20, 3));
-
-        // ApproxPercentileCont input types
-        let input_types = vec![
-            vec![DataType::Int8, DataType::Float64],
-            vec![DataType::Int16, DataType::Float64],
-            vec![DataType::Int32, DataType::Float64],
-            vec![DataType::Int64, DataType::Float64],
-            vec![DataType::UInt8, DataType::Float64],
-            vec![DataType::UInt16, DataType::Float64],
-            vec![DataType::UInt32, DataType::Float64],
-            vec![DataType::UInt64, DataType::Float64],
-            vec![DataType::Float32, DataType::Float64],
-            vec![DataType::Float64, DataType::Float64],
-        ];
-        for input_type in &input_types {
-            let signature = AggregateFunction::ApproxPercentileCont.signature();
-            let result = coerce_types(
-                &AggregateFunction::ApproxPercentileCont,
-                input_type,
-                &signature,
-            );
-            assert_eq!(*input_type, result.unwrap());
-        }
     }
 
     #[test]