Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/eliminate aggregate #28

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
10000fb
Delete docs.yaml
metesynnada Feb 22, 2024
6c76423
Merge pull request #5 from synnada-ai/ci-action-fixing
mustafasrepo Feb 22, 2024
599516b
Merge branch 'apache:main' into main
mustafasrepo Feb 23, 2024
b20b65c
Merge branch 'apache:main' into main
mustafasrepo Feb 26, 2024
6c066b9
Merge branch 'apache:main' into main
mustafasrepo Feb 26, 2024
6770106
Merge remote-tracking branch 'upstream/main'
mustafasrepo Feb 27, 2024
38db3d8
Merge branch 'apache:main' into main
mustafasrepo Feb 27, 2024
e21ac2b
Merge branch 'apache:main' into main
mustafasrepo Feb 28, 2024
b7e6320
Merge branch 'apache:main' into main
mustafasrepo Mar 1, 2024
d6e39a4
Merge branch 'apache:main' into main
mustafasrepo Mar 5, 2024
a2e53c2
initialize eliminate_aggregate.rs rule
Aug 1, 2024
79d82a0
Merge remote-tracking branch 'refs/remotes/origin/apache_main' into f…
Aug 1, 2024
5598fb4
remove redundant prints
Aug 1, 2024
c3257a6
Add multiple group by expression handling.
mustafasrepo Aug 2, 2024
7484eee
rename eliminate_aggregate.rs as eliminate_distinct.rs
Aug 2, 2024
b917cb3
Merge remote-tracking branch 'refs/remotes/origin/apache_main' into f…
Aug 2, 2024
37674ad
remove logic for distinct on since group by statement must exist in p…
Aug 2, 2024
06b47ec
format code
Aug 2, 2024
289d157
add eliminate_distinct rule to tests
Aug 2, 2024
a8e1b05
simplify function
Aug 2, 2024
b381368
fix child issue
Aug 2, 2024
d2da821
format
Aug 2, 2024
1bd49fd
fix docs
Aug 2, 2024
d890715
remove eliminate_distinct rule and make it a part of replace_distinct…
Aug 2, 2024
a200df9
Update datafusion/common/src/functional_dependencies.rs
mertak-synnada Aug 2, 2024
a592885
add comment and fix variable call
Aug 2, 2024
650613d
fix test cases as optimized plan
Aug 2, 2024
9341adf
format code
Aug 2, 2024
cea9595
simplify comments
mertak-synnada Aug 2, 2024
8155b4e
do not replace redundant distincts with aggregate
Aug 2, 2024
a759ff9
merge from remote
mertak-synnada Aug 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 25 additions & 15 deletions datafusion/common/src/functional_dependencies.rs
Original file line number Diff line number Diff line change
Expand Up @@ -524,22 +524,32 @@ pub fn aggregate_functional_dependencies(
}
}

// If we have a single GROUP BY key, we can guarantee uniqueness after
// When we have a GROUP BY key, we can guarantee uniqueness after
// aggregation:
if group_by_expr_names.len() == 1 {
// If `source_indices` contain 0, delete this functional dependency
// as it will be added anyway with mode `Dependency::Single`:
aggregate_func_dependencies.retain(|item| !item.source_indices.contains(&0));
// Add a new functional dependency associated with the whole table:
aggregate_func_dependencies.push(
// Use nullable property of the group by expression
FunctionalDependence::new(
vec![0],
target_indices,
aggr_fields[0].is_nullable(),
)
.with_mode(Dependency::Single),
);
if !group_by_expr_names.is_empty() {
let count = group_by_expr_names.len();
let source_indices = (0..count).collect::<Vec<_>>();
let nullable = source_indices
.iter()
.any(|idx| aggr_fields[*idx].is_nullable());
// If GROUP BY expressions do not already act as a determinant:
if !aggregate_func_dependencies.iter().any(|item| {
// If `item.source_indices` is a subset of GROUP BY expressions, we shouldn't add
// them since `item.source_indices` defines this relation already.

// This simple count comparison works because GROUP BY expressions
// arrive here as a prefix of the schema; the GROUP BY indices are
// guaranteed to cover the range [0..count).
mertak-synnada marked this conversation as resolved.
Show resolved Hide resolved
item.source_indices.iter().all(|idx| idx < &count)
}) {
// Add a new functional dependency associated with the whole table:
// Use nullable property of the GROUP BY expression:
aggregate_func_dependencies.push(
// Use nullable property of the GROUP BY expression:
FunctionalDependence::new(source_indices, target_indices, nullable)
.with_mode(Dependency::Single),
);
}
}
FunctionalDependencies::new(aggregate_func_dependencies)
}
Expand Down
87 changes: 87 additions & 0 deletions datafusion/optimizer/src/replace_distinct_aggregate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,20 @@ impl OptimizerRule for ReplaceDistinctWithAggregate {
match plan {
LogicalPlan::Distinct(Distinct::All(input)) => {
let group_expr = expand_wildcard(input.schema(), &input, None)?;

let fields = input.schema().fields();
let all_fields = (0..fields.len()).collect::<Vec<_>>();
let func_deps = input.schema().functional_dependencies().clone();

for func_dep in func_deps.iter() {
// The DISTINCT is exactly the same as the GROUP BY below it,
// so remove the redundant DISTINCT.
if func_dep.source_indices == all_fields {
return Ok(Transformed::yes(input.as_ref().clone()));
}
}

// Replace with Aggregation
ozankabak marked this conversation as resolved.
Show resolved Hide resolved
let aggr_plan = LogicalPlan::Aggregate(Aggregate::try_new(
input,
group_expr,
Expand Down Expand Up @@ -165,3 +179,76 @@ impl OptimizerRule for ReplaceDistinctWithAggregate {
Some(BottomUp)
}
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use crate::replace_distinct_aggregate::ReplaceDistinctWithAggregate;
    use crate::test::*;

    use datafusion_common::Result;
    use datafusion_expr::{
        col, logical_plan::builder::LogicalPlanBuilder, Expr, LogicalPlan,
    };
    use datafusion_functions_aggregate::sum::sum;

    /// Applies the `ReplaceDistinctWithAggregate` rule to `plan` and checks
    /// that the optimized plan renders exactly as `expected`.
    fn assert_optimized_plan_equal(plan: &LogicalPlan, expected: &str) -> Result<()> {
        let rule = Arc::new(ReplaceDistinctWithAggregate::new());
        assert_optimized_plan_eq(rule, plan.clone(), expected)
    }

    /// A DISTINCT whose input is already grouped by the very same (single)
    /// column is redundant: the rule should drop it entirely.
    #[test]
    fn eliminate_redundant_distinct_simple() -> Result<()> {
        let scan = test_table_scan().unwrap();
        let plan = LogicalPlanBuilder::from(scan)
            .aggregate(vec![col("c")], Vec::<Expr>::new())?
            .project(vec![col("c")])?
            .distinct()?
            .build()?;

        let expected = "Projection: test.c\n Aggregate: groupBy=[[test.c]], aggr=[[]]\n TableScan: test";
        assert_optimized_plan_equal(&plan, expected)
    }

    /// Same as above, but with a two-column GROUP BY key: a DISTINCT over
    /// exactly those columns is still redundant.
    #[test]
    fn eliminate_redundant_distinct_pair() -> Result<()> {
        let scan = test_table_scan().unwrap();
        let plan = LogicalPlanBuilder::from(scan)
            .aggregate(vec![col("a"), col("b")], Vec::<Expr>::new())?
            .project(vec![col("a"), col("b")])?
            .distinct()?
            .build()?;

        let expected =
            "Projection: test.a, test.b\n Aggregate: groupBy=[[test.a, test.b]], aggr=[[]]\n TableScan: test";
        assert_optimized_plan_equal(&plan, expected)
    }

    /// Without an underlying GROUP BY, the DISTINCT is not redundant and must
    /// be rewritten into an Aggregate rather than removed.
    #[test]
    fn do_not_eliminate_distinct() -> Result<()> {
        let scan = test_table_scan().unwrap();
        let plan = LogicalPlanBuilder::from(scan)
            .project(vec![col("a"), col("b")])?
            .distinct()?
            .build()?;

        let expected = "Aggregate: groupBy=[[test.a, test.b]], aggr=[[]]\n Projection: test.a, test.b\n TableScan: test";
        assert_optimized_plan_equal(&plan, expected)
    }

    /// The projection keeps only a strict subset of the GROUP BY columns, so
    /// the DISTINCT still changes the result and must become an Aggregate.
    #[test]
    fn do_not_eliminate_distinct_with_aggr() -> Result<()> {
        let scan = test_table_scan().unwrap();
        let plan = LogicalPlanBuilder::from(scan)
            .aggregate(vec![col("a"), col("b"), col("c")], vec![sum(col("c"))])?
            .project(vec![col("a"), col("b")])?
            .distinct()?
            .build()?;

        let expected =
            "Aggregate: groupBy=[[test.a, test.b]], aggr=[[]]\n Projection: test.a, test.b\n Aggregate: groupBy=[[test.a, test.b, test.c]], aggr=[[sum(test.c)]]\n TableScan: test";
        assert_optimized_plan_equal(&plan, expected)
    }
}
2 changes: 1 addition & 1 deletion datafusion/optimizer/src/single_distinct_to_groupby.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ use hashbrown::HashSet;
/// single distinct to group by optimizer rule
/// ```text
/// Before:
/// SELECT a, count(DINSTINCT b), sum(c)
/// SELECT a, count(DISTINCT b), sum(c)
/// FROM t
/// GROUP BY a
///
Expand Down
Loading