diff --git a/docs/configs.md b/docs/configs.md
index 3e93ca97a46..7d04336b8f5 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -133,7 +133,7 @@ Name | Description | Default Value | Incompatibilities
spark.rapids.sql.expression.IsNaN|checks if a value is NaN|true|None|
spark.rapids.sql.expression.IsNotNull|checks if a value is not null|true|None|
spark.rapids.sql.expression.IsNull|checks if a value is null|true|None|
-spark.rapids.sql.expression.KnownFloatingPointNormalized|tag to prevent redundant normalization|false|This is not 100% compatible with the Spark version because when enabling these, there may be extra groups produced for floating point grouping keys (e.g. -0.0, and 0.0)|
+spark.rapids.sql.expression.KnownFloatingPointNormalized|tag to prevent redundant normalization|true|None|
spark.rapids.sql.expression.Length|String Character Length|true|None|
spark.rapids.sql.expression.LessThan|< operator|true|None|
spark.rapids.sql.expression.LessThanOrEqual|<= operator|true|None|
@@ -202,7 +202,7 @@ Name | Description | Default Value | Incompatibilities
spark.rapids.sql.expression.Max|max aggregate operator|true|None|
spark.rapids.sql.expression.Min|min aggregate operator|true|None|
spark.rapids.sql.expression.Sum|sum aggregate operator|true|None|
-spark.rapids.sql.expression.NormalizeNaNAndZero|normalize nan and zero|false|This is not 100% compatible with the Spark version because when enabling these, there may be extra groups produced for floating point grouping keys (e.g. -0.0, and 0.0)|
+spark.rapids.sql.expression.NormalizeNaNAndZero|normalize nan and zero|true|None|
### Execution
Name | Description | Default Value | Incompatibilities
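With the incompatibility tag removed, both expressions default to enabled. A minimal sketch of toggling them per session, assuming a standard SparkSession setup (the builder boilerplate below is illustrative, not part of this change; the config keys come from the table above):

```scala
import org.apache.spark.sql.SparkSession

// Both expressions now default to enabled; setting them explicitly only
// matters if you want to opt out (set to "false").
val spark = SparkSession.builder()
  .appName("rapids-float-normalization-example")
  .config("spark.rapids.sql.expression.NormalizeNaNAndZero", "true")
  .config("spark.rapids.sql.expression.KnownFloatingPointNormalized", "true")
  .getOrCreate()
```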
diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py
index 3060ff4ca0d..eb306a81ee3 100644
--- a/integration_tests/src/main/python/hash_aggregate_test.py
+++ b/integration_tests/src/main/python/hash_aggregate_test.py
@@ -316,17 +316,41 @@ def test_hash_query_max_bug(data_gen):
lambda spark: gen_df(spark, data_gen, length=100).groupby('a').agg(f.max('b')))
+@approximate_float
@ignore_order
-@incompat
@pytest.mark.parametrize('data_gen', [_grpkey_floats_with_nan_zero_grouping_keys,
_grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
def test_hash_agg_with_nan_keys(data_gen):
df = with_cpu_session(
- lambda spark : gen_df(spark, data_gen, length=100))
+ lambda spark : gen_df(spark, data_gen, length=1024))
+ df.createOrReplaceTempView("hash_agg_table")
+ assert_gpu_and_cpu_are_equal_collect(
+ lambda spark: spark.sql(
+ 'select a, '
+ 'count(*) as count_stars, '
+ 'count(b) as count_bees, '
+ 'sum(b) as sum_of_bees, '
+ 'max(c) as max_seas, '
+ 'min(c) as min_seas, '
+ 'count(distinct c) as count_distinct_cees, '
+ 'avg(c) as average_seas '
+ 'from hash_agg_table group by a'),
+ conf=_no_nans_float_conf)
+
+
+@pytest.mark.xfail(reason="count(distinct floats) fails when there are NaN values in the aggregation column. "
+                          "(https://github.com/NVIDIA/spark-rapids/issues/194)")
+@approximate_float
+@ignore_order
+@pytest.mark.parametrize('data_gen', [_grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
+def test_count_distinct_with_nan_floats(data_gen):
+ df = with_cpu_session(
+ lambda spark : gen_df(spark, data_gen, length=1024))
df.createOrReplaceTempView("hash_agg_table")
assert_gpu_and_cpu_are_equal_collect(
lambda spark: spark.sql(
- 'select a, count(*) as cnt, sum(b) as sum_b, max(c) as max_c '
+ 'select a, '
+ 'count(distinct b) as count_distinct_bees '
'from hash_agg_table group by a'),
conf=_no_nans_float_conf)
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
index 17d9b3074bf..fda1230cbec 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
@@ -868,15 +868,13 @@ object GpuOverrides {
(a, conf, p, r) => new UnaryExprMeta[NormalizeNaNAndZero](a, conf, p, r) {
override def convertToGpu(child: GpuExpression): GpuExpression =
GpuNormalizeNaNAndZero(child)
- })
- .incompat(FLOAT_DIFFERS_GROUP_INCOMPAT),
+ }),
expr[KnownFloatingPointNormalized](
"tag to prevent redundant normalization",
(a, conf, p, r) => new UnaryExprMeta[KnownFloatingPointNormalized](a, conf, p, r) {
override def convertToGpu(child: GpuExpression): GpuExpression =
GpuKnownFloatingPointNormalized(child)
- })
- .incompat(FLOAT_DIFFERS_GROUP_INCOMPAT),
+ }),
expr[DateDiff]("datediff", (a, conf, p, r) =>
new BinaryExprMeta[DateDiff](a, conf, p, r) {
override def convertToGpu(lhs: GpuExpression, rhs: GpuExpression): GpuExpression = {
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/aggregate.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/aggregate.scala
index ade6f39f1db..c350e866d3b 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/aggregate.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/aggregate.scala
@@ -70,13 +70,6 @@ class GpuHashAggregateMeta(
}
}
val groupingExpressionTypes = agg.groupingExpressions.map(_.dataType)
- if (conf.hasNans &&
- (groupingExpressionTypes.contains(FloatType) ||
- groupingExpressionTypes.contains(DoubleType))) {
- willNotWorkOnGpu("grouping expressions and some aggregations over floating point columns " +
- "that may contain -0.0 and NaN are disabled. You can bypass this by setting " +
- s"${RapidsConf.HAS_NANS}=false")
- }
if (agg.resultExpressions.isEmpty) {
willNotWorkOnGpu("result expressions is empty")
}
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala
index 1ee5e7acef2..955fb745d7f 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala
@@ -1691,8 +1691,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
floatWithDifferentKindsOfNansAndZeros,
conf = new SparkConf()
.set(RapidsConf.HAS_NANS.key, "false")
- .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")
- .set(RapidsConf.INCOMPATIBLE_OPS.key, "true")) {
+ .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")) {
frame => frame.groupBy(col("float")).agg(sum(col("int")))
}
@@ -1701,8 +1700,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
doubleWithDifferentKindsOfNansAndZeros,
conf = new SparkConf()
.set(RapidsConf.HAS_NANS.key, "false")
- .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")
- .set(RapidsConf.INCOMPATIBLE_OPS.key, "true")) {
+ .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")) {
frame => frame.groupBy(col("double")).agg(sum(col("int")))
}
}
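With `RapidsConf.INCOMPATIBLE_OPS` no longer required, the test configuration reduces to the two settings kept above. A minimal standalone sketch of the same conf, using only the `RapidsConf` keys that appear in this diff:

```scala
import org.apache.spark.SparkConf
import com.nvidia.spark.rapids.RapidsConf

// Sketch of the trimmed conf used by the float-aggregation tests.
// INCOMPATIBLE_OPS is no longer needed because NormalizeNaNAndZero and
// KnownFloatingPointNormalized are no longer tagged as incompatible.
val conf = new SparkConf()
  .set(RapidsConf.HAS_NANS.key, "false")
  .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")
```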