From 9e3705c7270d3178041e4a29154431eda97ecfee Mon Sep 17 00:00:00 2001
From: Mithun RK
Date: Mon, 15 Jun 2020 16:49:54 -0700
Subject: [PATCH 1/4] Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized

---
 .../src/main/scala/ai/rapids/spark/GpuOverrides.scala    | 6 ++----
 .../test/scala/ai/rapids/spark/HashAggregatesSuite.scala | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/sql-plugin/src/main/scala/ai/rapids/spark/GpuOverrides.scala b/sql-plugin/src/main/scala/ai/rapids/spark/GpuOverrides.scala
index 9684b7ed2c1..e0567b213f1 100644
--- a/sql-plugin/src/main/scala/ai/rapids/spark/GpuOverrides.scala
+++ b/sql-plugin/src/main/scala/ai/rapids/spark/GpuOverrides.scala
@@ -867,15 +867,13 @@ object GpuOverrides {
       (a, conf, p, r) => new UnaryExprMeta[NormalizeNaNAndZero](a, conf, p, r) {
         override def convertToGpu(child: GpuExpression): GpuExpression =
           GpuNormalizeNaNAndZero(child)
-      })
-      .incompat(FLOAT_DIFFERS_GROUP_INCOMPAT),
+      }),
     expr[KnownFloatingPointNormalized](
       "tag to prevent redundant normalization",
       (a, conf, p, r) => new UnaryExprMeta[KnownFloatingPointNormalized](a, conf, p, r) {
         override def convertToGpu(child: GpuExpression): GpuExpression =
           GpuKnownFloatingPointNormalized(child)
-      })
-      .incompat(FLOAT_DIFFERS_GROUP_INCOMPAT),
+      }),
     expr[DateDiff]("datediff",
       (a, conf, p, r) => new BinaryExprMeta[DateDiff](a, conf, p, r) {
         override def convertToGpu(lhs: GpuExpression, rhs: GpuExpression): GpuExpression = {
diff --git a/tests/src/test/scala/ai/rapids/spark/HashAggregatesSuite.scala b/tests/src/test/scala/ai/rapids/spark/HashAggregatesSuite.scala
index 1fc8e443280..f68ffc1f4ec 100644
--- a/tests/src/test/scala/ai/rapids/spark/HashAggregatesSuite.scala
+++ b/tests/src/test/scala/ai/rapids/spark/HashAggregatesSuite.scala
@@ -1691,8 +1691,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
     floatWithDifferentKindsOfNansAndZeros,
     conf = new SparkConf()
       .set(RapidsConf.HAS_NANS.key, "false")
-      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")
-      .set(RapidsConf.INCOMPATIBLE_OPS.key, "true")) {
+      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")) {
     frame => frame.groupBy(col("float")).agg(sum(col("int")))
   }

@@ -1701,8 +1700,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
     doubleWithDifferentKindsOfNansAndZeros,
     conf = new SparkConf()
       .set(RapidsConf.HAS_NANS.key, "false")
-      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")
-      .set(RapidsConf.INCOMPATIBLE_OPS.key, "true")) {
+      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")) {
     frame => frame.groupBy(col("double")).agg(sum(col("int")))
   }
 }

From 11bd233e6a543bfbd654c8b09777429bbfc5bb5f Mon Sep 17 00:00:00 2001
From: Mithun RK
Date: Tue, 16 Jun 2020 10:50:18 -0700
Subject: [PATCH 2/4] Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized

(Removed incompat in Python integration test for HashAggregateExec, where obvious.)
---
 docs/configs.md                                           | 4 ++--
 integration_tests/src/main/python/hash_aggregate_test.py  | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/configs.md b/docs/configs.md
index cecbbb63cb8..fd27660ab7b 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -133,7 +133,7 @@ Name | Description | Default Value | Incompatibilities
 spark.rapids.sql.expression.IsNaN|checks if a value is NaN|true|None|
 spark.rapids.sql.expression.IsNotNull|checks if a value is not null|true|None|
 spark.rapids.sql.expression.IsNull|checks if a value is null|true|None|
-spark.rapids.sql.expression.KnownFloatingPointNormalized|tag to prevent redundant normalization|false|This is not 100% compatible with the Spark version because when enabling these, there may be extra groups produced for floating point grouping keys (e.g. -0.0, and 0.0)|
+spark.rapids.sql.expression.KnownFloatingPointNormalized|tag to prevent redundant normalization|true|None|
 spark.rapids.sql.expression.Length|String Character Length|true|None|
 spark.rapids.sql.expression.LessThan|< operator|true|None|
 spark.rapids.sql.expression.LessThanOrEqual|<= operator|true|None|
@@ -202,7 +202,7 @@ Name | Description | Default Value | Incompatibilities
 spark.rapids.sql.expression.Max|max aggregate operator|true|None|
 spark.rapids.sql.expression.Min|min aggregate operator|true|None|
 spark.rapids.sql.expression.Sum|sum aggregate operator|true|None|
-spark.rapids.sql.expression.NormalizeNaNAndZero|normalize nan and zero|false|This is not 100% compatible with the Spark version because when enabling these, there may be extra groups produced for floating point grouping keys (e.g. -0.0, and 0.0)|
+spark.rapids.sql.expression.NormalizeNaNAndZero|normalize nan and zero|true|None|

 ### Execution
 Name | Description | Default Value | Incompatibilities
diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py
index 3060ff4ca0d..1f19817976c 100644
--- a/integration_tests/src/main/python/hash_aggregate_test.py
+++ b/integration_tests/src/main/python/hash_aggregate_test.py
@@ -317,7 +317,6 @@ def test_hash_query_max_bug(data_gen):


 @ignore_order
-@incompat
 @pytest.mark.parametrize('data_gen', [_grpkey_floats_with_nan_zero_grouping_keys,
                                       _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
 def test_hash_agg_with_nan_keys(data_gen):

From 0a241a8f891f88095662cfdfeee8f1a8c248ea4a Mon Sep 17 00:00:00 2001
From: Mithun RK
Date: Tue, 16 Jun 2020 13:48:27 -0700
Subject: [PATCH 3/4] Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized

Removed obviated check for NaNs from GpuHashAggregateMeta
---
 sql-plugin/src/main/scala/ai/rapids/spark/aggregate.scala | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/sql-plugin/src/main/scala/ai/rapids/spark/aggregate.scala b/sql-plugin/src/main/scala/ai/rapids/spark/aggregate.scala
index a42310580f7..0e9e5d14e44 100644
--- a/sql-plugin/src/main/scala/ai/rapids/spark/aggregate.scala
+++ b/sql-plugin/src/main/scala/ai/rapids/spark/aggregate.scala
@@ -70,13 +70,6 @@ class GpuHashAggregateMeta(
       }
     }
     val groupingExpressionTypes = agg.groupingExpressions.map(_.dataType)
-    if (conf.hasNans &&
-      (groupingExpressionTypes.contains(FloatType) ||
-        groupingExpressionTypes.contains(DoubleType))) {
-      willNotWorkOnGpu("grouping expressions and some aggregations over floating point columns " +
-        "that may contain -0.0 and NaN are disabled. You can bypass this by setting " +
-        s"${RapidsConf.HAS_NANS}=false")
-    }
     if (agg.resultExpressions.isEmpty) {
       willNotWorkOnGpu("result expressions is empty")
     }

From f427e5f57fc0862bf735a5061223a2b00808f110 Mon Sep 17 00:00:00 2001
From: Mithun RK
Date: Tue, 16 Jun 2020 16:03:12 -0700
Subject: [PATCH 4/4] Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized

Added more aggregation functions to groupby tests.
---
 .../src/main/python/hash_aggregate_test.py | 29 +++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py
index 1f19817976c..eb306a81ee3 100644
--- a/integration_tests/src/main/python/hash_aggregate_test.py
+++ b/integration_tests/src/main/python/hash_aggregate_test.py
@@ -316,16 +316,41 @@ def test_hash_query_max_bug(data_gen):
         lambda spark: gen_df(spark, data_gen, length=100).groupby('a').agg(f.max('b')))


+@approximate_float
 @ignore_order
 @pytest.mark.parametrize('data_gen', [_grpkey_floats_with_nan_zero_grouping_keys,
                                       _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
 def test_hash_agg_with_nan_keys(data_gen):
     df = with_cpu_session(
-        lambda spark : gen_df(spark, data_gen, length=100))
+        lambda spark : gen_df(spark, data_gen, length=1024))
+    df.createOrReplaceTempView("hash_agg_table")
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: spark.sql(
+            'select a, '
+            'count(*) as count_stars, '
+            'count(b) as count_bees, '
+            'sum(b) as sum_of_bees, '
+            'max(c) as max_seas, '
+            'min(c) as min_seas, '
+            'count(distinct c) as count_distinct_cees, '
+            'avg(c) as average_seas '
+            'from hash_agg_table group by a'),
+        conf=_no_nans_float_conf)
+
+
+@pytest.mark.xfail(reason="count(distinct floats) fails when there are NaN values in the aggregation column."
+                          "(https://github.com/NVIDIA/spark-rapids/issues/194)")
+@approximate_float
+@ignore_order
+@pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
+def test_count_distinct_with_nan_floats(data_gen):
+    df = with_cpu_session(
+        lambda spark : gen_df(spark, data_gen, length=1024))
     df.createOrReplaceTempView("hash_agg_table")
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: spark.sql(
-            'select a, count(*) as cnt, sum(b) as sum_b, max(c) as max_c '
+            'select a, '
+            'count(distinct b) as count_distinct_bees '
             'from hash_agg_table group by a'),
         conf=_no_nans_float_conf)
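Note (reviewer illustration, not part of the patch series): the incompatibility tag removed above existed because, without normalization, floating point grouping keys such as 0.0 and -0.0 (and differing NaN payloads) can land in separate groups, which is exactly what the deleted docs/configs.md text described. Spark's NormalizeNaNAndZero canonicalizes those keys, folding -0.0 into 0.0 and every NaN bit pattern into one canonical NaN, so once the plugin performs the same normalization the extra groups disappear, which appears to be why the FLOAT_DIFFERS_GROUP_INCOMPAT tags and the hasNans guard in GpuHashAggregateMeta are dropped. The Scala sketch below mimics that canonicalization on plain doubles; NormalizeNaNAndZeroSketch and normalize are made-up names for illustration only and are not the plugin's GPU code path.

// Minimal sketch of NaN/zero key normalization (illustration only, not plugin code).
object NormalizeNaNAndZeroSketch {
  // Mimics what NormalizeNaNAndZero does to a double value:
  // -0.0 folds into 0.0 and any NaN becomes the single canonical NaN.
  def normalize(d: Double): Double =
    if (d.isNaN) Double.NaN
    else if (d == 0.0d) 0.0d   // primitive == treats -0.0 and 0.0 as equal, so -0.0 maps to 0.0
    else d

  def main(args: Array[String]): Unit = {
    val keys = Seq(0.0, -0.0, Double.NaN, 1.5)

    // Keying on the raw 64-bit pattern, the way a binary-comparing hash
    // aggregate sees the column, keeps 0.0 and -0.0 apart:
    val raw = keys.groupBy(d => java.lang.Double.doubleToRawLongBits(d))
    println(raw.size)          // 4 groups: 0.0, -0.0, NaN, 1.5

    // After normalization the duplicate zero group collapses:
    val normalized = keys.groupBy(d => java.lang.Double.doubleToRawLongBits(normalize(d)))
    println(normalized.size)   // 3 groups: 0.0, NaN, 1.5
  }
}

With both CPU and GPU seeing the normalized keys, the groupby results compared by test_hash_agg_with_nan_keys above should agree without the incompat flag.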