From e0209e84d8f2f1c111660aedf857a015c3fda54d Mon Sep 17 00:00:00 2001 From: mythrocks Date: Thu, 18 Jun 2020 06:59:18 -0700 Subject: [PATCH] Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized (#181) --- docs/configs.md | 4 +-- .../src/main/python/hash_aggregate_test.py | 30 +++++++++++++++++-- .../nvidia/spark/rapids/GpuOverrides.scala | 6 ++-- .../com/nvidia/spark/rapids/aggregate.scala | 7 ----- .../spark/rapids/HashAggregatesSuite.scala | 6 ++-- 5 files changed, 33 insertions(+), 20 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index 3e93ca97a46..7d04336b8f5 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -133,7 +133,7 @@ Name | Description | Default Value | Incompatibilities spark.rapids.sql.expression.IsNaN|checks if a value is NaN|true|None| spark.rapids.sql.expression.IsNotNull|checks if a value is not null|true|None| spark.rapids.sql.expression.IsNull|checks if a value is null|true|None| -spark.rapids.sql.expression.KnownFloatingPointNormalized|tag to prevent redundant normalization|false|This is not 100% compatible with the Spark version because when enabling these, there may be extra groups produced for floating point grouping keys (e.g. -0.0, and 0.0)| +spark.rapids.sql.expression.KnownFloatingPointNormalized|tag to prevent redundant normalization|true|None| spark.rapids.sql.expression.Length|String Character Length|true|None| spark.rapids.sql.expression.LessThan|< operator|true|None| spark.rapids.sql.expression.LessThanOrEqual|<= operator|true|None| @@ -202,7 +202,7 @@ Name | Description | Default Value | Incompatibilities spark.rapids.sql.expression.Max|max aggregate operator|true|None| spark.rapids.sql.expression.Min|min aggregate operator|true|None| spark.rapids.sql.expression.Sum|sum aggregate operator|true|None| -spark.rapids.sql.expression.NormalizeNaNAndZero|normalize nan and zero|false|This is not 100% compatible with the Spark version because when enabling these, there may be extra groups produced for floating point grouping keys (e.g. -0.0, and 0.0)| +spark.rapids.sql.expression.NormalizeNaNAndZero|normalize nan and zero|true|None| ### Execution Name | Description | Default Value | Incompatibilities diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index 3060ff4ca0d..eb306a81ee3 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -316,17 +316,41 @@ def test_hash_query_max_bug(data_gen): lambda spark: gen_df(spark, data_gen, length=100).groupby('a').agg(f.max('b'))) +@approximate_float @ignore_order -@incompat @pytest.mark.parametrize('data_gen', [_grpkey_floats_with_nan_zero_grouping_keys, _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn) def test_hash_agg_with_nan_keys(data_gen): df = with_cpu_session( - lambda spark : gen_df(spark, data_gen, length=100)) + lambda spark : gen_df(spark, data_gen, length=1024)) + df.createOrReplaceTempView("hash_agg_table") + assert_gpu_and_cpu_are_equal_collect( + lambda spark: spark.sql( + 'select a, ' + 'count(*) as count_stars, ' + 'count(b) as count_bees, ' + 'sum(b) as sum_of_bees, ' + 'max(c) as max_seas, ' + 'min(c) as min_seas, ' + 'count(distinct c) as count_distinct_cees, ' + 'avg(c) as average_seas ' + 'from hash_agg_table group by a'), + conf=_no_nans_float_conf) + + +@pytest.mark.xfail(reason="count(distinct floats) fails when there are NaN values in the aggregation column." + "(https://github.com/NVIDIA/spark-rapids/issues/194)") +@approximate_float +@ignore_order +@pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn) +def test_count_distinct_with_nan_floats(data_gen): + df = with_cpu_session( + lambda spark : gen_df(spark, data_gen, length=1024)) df.createOrReplaceTempView("hash_agg_table") assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.sql( - 'select a, count(*) as cnt, sum(b) as sum_b, max(c) as max_c ' + 'select a, ' + 'count(distinct b) as count_distinct_bees ' 'from hash_agg_table group by a'), conf=_no_nans_float_conf) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 17d9b3074bf..fda1230cbec 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -868,15 +868,13 @@ object GpuOverrides { (a, conf, p, r) => new UnaryExprMeta[NormalizeNaNAndZero](a, conf, p, r) { override def convertToGpu(child: GpuExpression): GpuExpression = GpuNormalizeNaNAndZero(child) - }) - .incompat(FLOAT_DIFFERS_GROUP_INCOMPAT), + }), expr[KnownFloatingPointNormalized]( "tag to prevent redundant normalization", (a, conf, p, r) => new UnaryExprMeta[KnownFloatingPointNormalized](a, conf, p, r) { override def convertToGpu(child: GpuExpression): GpuExpression = GpuKnownFloatingPointNormalized(child) - }) - .incompat(FLOAT_DIFFERS_GROUP_INCOMPAT), + }), expr[DateDiff]("datediff", (a, conf, p, r) => new BinaryExprMeta[DateDiff](a, conf, p, r) { override def convertToGpu(lhs: GpuExpression, rhs: GpuExpression): GpuExpression = { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/aggregate.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/aggregate.scala index ade6f39f1db..c350e866d3b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/aggregate.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/aggregate.scala @@ -70,13 +70,6 @@ class GpuHashAggregateMeta( } } val groupingExpressionTypes = agg.groupingExpressions.map(_.dataType) - if (conf.hasNans && - (groupingExpressionTypes.contains(FloatType) || - groupingExpressionTypes.contains(DoubleType))) { - willNotWorkOnGpu("grouping expressions and some aggregations over floating point columns " + - "that may contain -0.0 and NaN are disabled. You can bypass this by setting " + - s"${RapidsConf.HAS_NANS}=false") - } if (agg.resultExpressions.isEmpty) { willNotWorkOnGpu("result expressions is empty") } diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala index 1ee5e7acef2..955fb745d7f 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala @@ -1691,8 +1691,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite { floatWithDifferentKindsOfNansAndZeros, conf = new SparkConf() .set(RapidsConf.HAS_NANS.key, "false") - .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true") - .set(RapidsConf.INCOMPATIBLE_OPS.key, "true")) { + .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")) { frame => frame.groupBy(col("float")).agg(sum(col("int"))) } @@ -1701,8 +1700,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite { doubleWithDifferentKindsOfNansAndZeros, conf = new SparkConf() .set(RapidsConf.HAS_NANS.key, "false") - .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true") - .set(RapidsConf.INCOMPATIBLE_OPS.key, "true")) { + .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")) { frame => frame.groupBy(col("double")).agg(sum(col("int"))) } }