From 9e3705c7270d3178041e4a29154431eda97ecfee Mon Sep 17 00:00:00 2001
From: Mithun RK
Date: Mon, 15 Jun 2020 16:49:54 -0700
Subject: [PATCH 1/4] Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized

---
 .../src/main/scala/ai/rapids/spark/GpuOverrides.scala    | 6 ++----
 .../test/scala/ai/rapids/spark/HashAggregatesSuite.scala | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/sql-plugin/src/main/scala/ai/rapids/spark/GpuOverrides.scala b/sql-plugin/src/main/scala/ai/rapids/spark/GpuOverrides.scala
index 9684b7ed2c1..e0567b213f1 100644
--- a/sql-plugin/src/main/scala/ai/rapids/spark/GpuOverrides.scala
+++ b/sql-plugin/src/main/scala/ai/rapids/spark/GpuOverrides.scala
@@ -867,15 +867,13 @@ object GpuOverrides {
       (a, conf, p, r) => new UnaryExprMeta[NormalizeNaNAndZero](a, conf, p, r) {
         override def convertToGpu(child: GpuExpression): GpuExpression =
           GpuNormalizeNaNAndZero(child)
-      })
-      .incompat(FLOAT_DIFFERS_GROUP_INCOMPAT),
+      }),
     expr[KnownFloatingPointNormalized](
       "tag to prevent redundant normalization",
       (a, conf, p, r) => new UnaryExprMeta[KnownFloatingPointNormalized](a, conf, p, r) {
         override def convertToGpu(child: GpuExpression): GpuExpression =
           GpuKnownFloatingPointNormalized(child)
-      })
-      .incompat(FLOAT_DIFFERS_GROUP_INCOMPAT),
+      }),
     expr[DateDiff]("datediff",
       (a, conf, p, r) => new BinaryExprMeta[DateDiff](a, conf, p, r) {
         override def convertToGpu(lhs: GpuExpression, rhs: GpuExpression): GpuExpression = {
diff --git a/tests/src/test/scala/ai/rapids/spark/HashAggregatesSuite.scala b/tests/src/test/scala/ai/rapids/spark/HashAggregatesSuite.scala
index 1fc8e443280..f68ffc1f4ec 100644
--- a/tests/src/test/scala/ai/rapids/spark/HashAggregatesSuite.scala
+++ b/tests/src/test/scala/ai/rapids/spark/HashAggregatesSuite.scala
@@ -1691,8 +1691,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
     floatWithDifferentKindsOfNansAndZeros,
     conf = new SparkConf()
       .set(RapidsConf.HAS_NANS.key, "false")
-      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")
-      .set(RapidsConf.INCOMPATIBLE_OPS.key, "true")) {
+      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")) {
     frame => frame.groupBy(col("float")).agg(sum(col("int")))
   }

@@ -1701,8 +1700,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
     doubleWithDifferentKindsOfNansAndZeros,
     conf = new SparkConf()
       .set(RapidsConf.HAS_NANS.key, "false")
-      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")
-      .set(RapidsConf.INCOMPATIBLE_OPS.key, "true")) {
+      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")) {
     frame => frame.groupBy(col("double")).agg(sum(col("int")))
   }
 }

From 11bd233e6a543bfbd654c8b09777429bbfc5bb5f Mon Sep 17 00:00:00 2001
From: Mithun RK
Date: Tue, 16 Jun 2020 10:50:18 -0700
Subject: [PATCH 2/4] Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized

(Removed incompat in Python integration test for HashAggregateExec, where obvious.)
---
 docs/configs.md                                           | 4 ++--
 integration_tests/src/main/python/hash_aggregate_test.py  | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/configs.md b/docs/configs.md
index cecbbb63cb8..fd27660ab7b 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -133,7 +133,7 @@ Name | Description | Default Value | Incompatibilities
 spark.rapids.sql.expression.IsNaN|checks if a value is NaN|true|None|
 spark.rapids.sql.expression.IsNotNull|checks if a value is not null|true|None|
 spark.rapids.sql.expression.IsNull|checks if a value is null|true|None|
-spark.rapids.sql.expression.KnownFloatingPointNormalized|tag to prevent redundant normalization|false|This is not 100% compatible with the Spark version because when enabling these, there may be extra groups produced for floating point grouping keys (e.g. -0.0, and 0.0)|
+spark.rapids.sql.expression.KnownFloatingPointNormalized|tag to prevent redundant normalization|true|None|
 spark.rapids.sql.expression.Length|String Character Length|true|None|
 spark.rapids.sql.expression.LessThan|< operator|true|None|
 spark.rapids.sql.expression.LessThanOrEqual|<= operator|true|None|
@@ -202,7 +202,7 @@ Name | Description | Default Value | Incompatibilities
 spark.rapids.sql.expression.Max|max aggregate operator|true|None|
 spark.rapids.sql.expression.Min|min aggregate operator|true|None|
 spark.rapids.sql.expression.Sum|sum aggregate operator|true|None|
-spark.rapids.sql.expression.NormalizeNaNAndZero|normalize nan and zero|false|This is not 100% compatible with the Spark version because when enabling these, there may be extra groups produced for floating point grouping keys (e.g. -0.0, and 0.0)|
+spark.rapids.sql.expression.NormalizeNaNAndZero|normalize nan and zero|true|None|

 ### Execution
 Name | Description | Default Value | Incompatibilities
diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py
index 3060ff4ca0d..1f19817976c 100644
--- a/integration_tests/src/main/python/hash_aggregate_test.py
+++ b/integration_tests/src/main/python/hash_aggregate_test.py
@@ -317,7 +317,6 @@ def test_hash_query_max_bug(data_gen):


 @ignore_order
-@incompat
 @pytest.mark.parametrize('data_gen', [_grpkey_floats_with_nan_zero_grouping_keys,
                                       _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
 def test_hash_agg_with_nan_keys(data_gen):

From 0a241a8f891f88095662cfdfeee8f1a8c248ea4a Mon Sep 17 00:00:00 2001
From: Mithun RK
Date: Tue, 16 Jun 2020 13:48:27 -0700
Subject: [PATCH 3/4] Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized

Removed obviated check for NaNs from GpuHashAggregateMeta
---
 sql-plugin/src/main/scala/ai/rapids/spark/aggregate.scala | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/sql-plugin/src/main/scala/ai/rapids/spark/aggregate.scala b/sql-plugin/src/main/scala/ai/rapids/spark/aggregate.scala
index a42310580f7..0e9e5d14e44 100644
--- a/sql-plugin/src/main/scala/ai/rapids/spark/aggregate.scala
+++ b/sql-plugin/src/main/scala/ai/rapids/spark/aggregate.scala
@@ -70,13 +70,6 @@ class GpuHashAggregateMeta(
       }
     }
     val groupingExpressionTypes = agg.groupingExpressions.map(_.dataType)
-    if (conf.hasNans &&
-      (groupingExpressionTypes.contains(FloatType) ||
-        groupingExpressionTypes.contains(DoubleType))) {
-      willNotWorkOnGpu("grouping expressions and some aggregations over floating point columns " +
-        "that may contain -0.0 and NaN are disabled. You can bypass this by setting " +
-        s"${RapidsConf.HAS_NANS}=false")
-    }
     if (agg.resultExpressions.isEmpty) {
       willNotWorkOnGpu("result expressions is empty")
     }

From f427e5f57fc0862bf735a5061223a2b00808f110 Mon Sep 17 00:00:00 2001
From: Mithun RK
Date: Tue, 16 Jun 2020 16:03:12 -0700
Subject: [PATCH 4/4] Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized

Added more aggregation functions to groupby tests.
---
 .../src/main/python/hash_aggregate_test.py | 29 +++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py
index 1f19817976c..eb306a81ee3 100644
--- a/integration_tests/src/main/python/hash_aggregate_test.py
+++ b/integration_tests/src/main/python/hash_aggregate_test.py
@@ -316,16 +316,41 @@ def test_hash_query_max_bug(data_gen):
         lambda spark: gen_df(spark, data_gen, length=100).groupby('a').agg(f.max('b')))


+@approximate_float
 @ignore_order
 @pytest.mark.parametrize('data_gen', [_grpkey_floats_with_nan_zero_grouping_keys,
                                       _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
 def test_hash_agg_with_nan_keys(data_gen):
     df = with_cpu_session(
-        lambda spark : gen_df(spark, data_gen, length=100))
+        lambda spark : gen_df(spark, data_gen, length=1024))
+    df.createOrReplaceTempView("hash_agg_table")
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: spark.sql(
+            'select a, '
+            'count(*) as count_stars, '
+            'count(b) as count_bees, '
+            'sum(b) as sum_of_bees, '
+            'max(c) as max_seas, '
+            'min(c) as min_seas, '
+            'count(distinct c) as count_distinct_cees, '
+            'avg(c) as average_seas '
+            'from hash_agg_table group by a'),
+        conf=_no_nans_float_conf)
+
+
+@pytest.mark.xfail(reason="count(distinct floats) fails when there are NaN values in the aggregation column."
+                          "(https://github.com/NVIDIA/spark-rapids/issues/194)")
+@approximate_float
+@ignore_order
+@pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
+def test_count_distinct_with_nan_floats(data_gen):
+    df = with_cpu_session(
+        lambda spark : gen_df(spark, data_gen, length=1024))
     df.createOrReplaceTempView("hash_agg_table")
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: spark.sql(
-            'select a, count(*) as cnt, sum(b) as sum_b, max(c) as max_c '
+            'select a, '
+            'count(distinct b) as count_distinct_bees '
             'from hash_agg_table group by a'),
         conf=_no_nans_float_conf)
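Note (reviewer illustration, not part of the patch series): the incompatibility tag removed above existed because, without normalization, floating point grouping keys such as 0.0 and -0.0 (and differing NaN payloads) can land in separate groups, which is exactly what the deleted docs/configs.md text described. Spark's NormalizeNaNAndZero canonicalizes those keys, folding -0.0 into 0.0 and every NaN bit pattern into one canonical NaN, so once the plugin performs the same normalization the extra groups disappear, which appears to be why the FLOAT_DIFFERS_GROUP_INCOMPAT tags and the hasNans guard in GpuHashAggregateMeta are dropped. The Scala sketch below mimics that canonicalization on plain doubles; NormalizeNaNAndZeroSketch and normalize are made-up names for illustration only and are not the plugin's GPU code path.

// Minimal sketch of NaN/zero key normalization (illustration only, not plugin code).
object NormalizeNaNAndZeroSketch {
  // Mimics what NormalizeNaNAndZero does to a double value:
  // -0.0 folds into 0.0 and any NaN becomes the single canonical NaN.
  def normalize(d: Double): Double =
    if (d.isNaN) Double.NaN
    else if (d == 0.0d) 0.0d   // primitive == treats -0.0 and 0.0 as equal, so -0.0 maps to 0.0
    else d

  def main(args: Array[String]): Unit = {
    val keys = Seq(0.0, -0.0, Double.NaN, 1.5)

    // Keying on the raw 64-bit pattern, the way a binary-comparing hash
    // aggregate sees the column, keeps 0.0 and -0.0 apart:
    val raw = keys.groupBy(d => java.lang.Double.doubleToRawLongBits(d))
    println(raw.size)          // 4 groups: 0.0, -0.0, NaN, 1.5

    // After normalization the duplicate zero group collapses:
    val normalized = keys.groupBy(d => java.lang.Double.doubleToRawLongBits(normalize(d)))
    println(normalized.size)   // 3 groups: 0.0, NaN, 1.5
  }
}

With both CPU and GPU seeing the normalized keys, the groupby results compared by test_hash_agg_with_nan_keys above should agree without the incompat flag.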