Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized
mythrocks authored Jun 18, 2020
1 parent 561b722 commit e0209e8
Showing 5 changed files with 33 additions and 20 deletions.
4 changes: 2 additions & 2 deletions docs/configs.md
@@ -133,7 +133,7 @@ Name | Description | Default Value | Incompatibilities
 <a name="sql.expression.IsNaN"></a>spark.rapids.sql.expression.IsNaN|checks if a value is NaN|true|None|
 <a name="sql.expression.IsNotNull"></a>spark.rapids.sql.expression.IsNotNull|checks if a value is not null|true|None|
 <a name="sql.expression.IsNull"></a>spark.rapids.sql.expression.IsNull|checks if a value is null|true|None|
-<a name="sql.expression.KnownFloatingPointNormalized"></a>spark.rapids.sql.expression.KnownFloatingPointNormalized|tag to prevent redundant normalization|false|This is not 100% compatible with the Spark version because when enabling these, there may be extra groups produced for floating point grouping keys (e.g. -0.0, and 0.0)|
+<a name="sql.expression.KnownFloatingPointNormalized"></a>spark.rapids.sql.expression.KnownFloatingPointNormalized|tag to prevent redundant normalization|true|None|
 <a name="sql.expression.Length"></a>spark.rapids.sql.expression.Length|String Character Length|true|None|
 <a name="sql.expression.LessThan"></a>spark.rapids.sql.expression.LessThan|< operator|true|None|
 <a name="sql.expression.LessThanOrEqual"></a>spark.rapids.sql.expression.LessThanOrEqual|<= operator|true|None|
@@ -202,7 +202,7 @@ Name | Description | Default Value | Incompatibilities
 <a name="sql.expression.Max"></a>spark.rapids.sql.expression.Max|max aggregate operator|true|None|
 <a name="sql.expression.Min"></a>spark.rapids.sql.expression.Min|min aggregate operator|true|None|
 <a name="sql.expression.Sum"></a>spark.rapids.sql.expression.Sum|sum aggregate operator|true|None|
-<a name="sql.expression.NormalizeNaNAndZero"></a>spark.rapids.sql.expression.NormalizeNaNAndZero|normalize nan and zero|false|This is not 100% compatible with the Spark version because when enabling these, there may be extra groups produced for floating point grouping keys (e.g. -0.0, and 0.0)|
+<a name="sql.expression.NormalizeNaNAndZero"></a>spark.rapids.sql.expression.NormalizeNaNAndZero|normalize nan and zero|true|None|
 
 ### Execution
 Name | Description | Default Value | Incompatibilities
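Practically, this flips the two normalization expressions from opt-in to on by default. A minimal sketch of the old opt-in, using only the per-expression keys shown in the table above (the session-builder boilerplate is illustrative, not from this repo):

```scala
import org.apache.spark.sql.SparkSession

// Before this commit, both expressions defaulted to false, so grouping on
// float/double keys required enabling them explicitly:
val spark = SparkSession.builder()
  .config("spark.rapids.sql.expression.NormalizeNaNAndZero", "true")
  .config("spark.rapids.sql.expression.KnownFloatingPointNormalized", "true")
  .getOrCreate()
// After this commit both default to true, so neither line is needed.
```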
30 changes: 27 additions & 3 deletions integration_tests/src/main/python/hash_aggregate_test.py
@@ -316,17 +316,41 @@ def test_hash_query_max_bug(data_gen):
         lambda spark: gen_df(spark, data_gen, length=100).groupby('a').agg(f.max('b')))
 
 
 @approximate_float
 @ignore_order
-@incompat
 @pytest.mark.parametrize('data_gen', [_grpkey_floats_with_nan_zero_grouping_keys,
                                       _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
 def test_hash_agg_with_nan_keys(data_gen):
     df = with_cpu_session(
-        lambda spark : gen_df(spark, data_gen, length=100))
+        lambda spark : gen_df(spark, data_gen, length=1024))
+    df.createOrReplaceTempView("hash_agg_table")
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: spark.sql(
+            'select a, '
+            'count(*) as count_stars, '
+            'count(b) as count_bees, '
+            'sum(b) as sum_of_bees, '
+            'max(c) as max_seas, '
+            'min(c) as min_seas, '
+            'count(distinct c) as count_distinct_cees, '
+            'avg(c) as average_seas '
+            'from hash_agg_table group by a'),
+        conf=_no_nans_float_conf)
+
+
+@pytest.mark.xfail(reason="count(distinct floats) fails when there are NaN values in the aggregation column."
+                          "(https://github.com/NVIDIA/spark-rapids/issues/194)")
+@approximate_float
+@ignore_order
+@pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
+def test_count_distinct_with_nan_floats(data_gen):
+    df = with_cpu_session(
+        lambda spark : gen_df(spark, data_gen, length=1024))
     df.createOrReplaceTempView("hash_agg_table")
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: spark.sql(
-            'select a, count(*) as cnt, sum(b) as sum_b, max(c) as max_c '
+            'select a, '
+            'count(distinct b) as count_distinct_bees '
             'from hash_agg_table group by a'),
         conf=_no_nans_float_conf)

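The xfail above comes down to IEEE 754 equality: NaN never compares equal to itself, so a distinct count that compares raw float values sees each NaN as a new key unless the values are normalized first. A minimal JVM-level sketch of the distinction (illustrative only, not test code):

```scala
// NaN != NaN under IEEE 754 value equality, which is what breaks a naive
// count(distinct) over a column containing NaNs (issue #194):
val a = Double.NaN
val b = 0.0 / 0.0
println(a == b)                               // false: value equality
println(java.lang.Double.compare(a, b) == 0)  // true: total ordering groups NaNs

// NormalizeNaNAndZero sidesteps this by rewriting every NaN to one
// canonical bit pattern (and -0.0 to 0.0) before hashing or comparison.
```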
@@ -868,15 +868,13 @@ object GpuOverrides {
       (a, conf, p, r) => new UnaryExprMeta[NormalizeNaNAndZero](a, conf, p, r) {
         override def convertToGpu(child: GpuExpression): GpuExpression =
           GpuNormalizeNaNAndZero(child)
-      })
-      .incompat(FLOAT_DIFFERS_GROUP_INCOMPAT),
+      }),
     expr[KnownFloatingPointNormalized](
       "tag to prevent redundant normalization",
       (a, conf, p, r) => new UnaryExprMeta[KnownFloatingPointNormalized](a, conf, p, r) {
         override def convertToGpu(child: GpuExpression): GpuExpression =
           GpuKnownFloatingPointNormalized(child)
-      })
-      .incompat(FLOAT_DIFFERS_GROUP_INCOMPAT),
+      }),
     expr[DateDiff]("datediff", (a, conf, p, r) =>
       new BinaryExprMeta[DateDiff](a, conf, p, r) {
         override def convertToGpu(lhs: GpuExpression, rhs: GpuExpression): GpuExpression = {
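For reference, the contract these two GPU expressions implement mirrors Spark's CPU expressions of the same names: NormalizeNaNAndZero canonicalizes the only two floating-point values with multiple representations, and KnownFloatingPointNormalized is a pass-through marker telling the planner its child needs no further normalization. A sketch of that contract (illustrative, not plugin code):

```scala
// Canonicalize the two problem values; everything else passes through.
def normalize(d: Double): Double =
  if (d.isNaN) Double.NaN   // any NaN bit pattern -> the canonical NaN
  else if (d == 0.0) 0.0    // folds -0.0 into +0.0 (0.0 == -0.0 is true)
  else d

// -0.0 and 0.0 now share one bit pattern, and every NaN is canonical:
assert(java.lang.Double.doubleToRawLongBits(normalize(-0.0)) ==
       java.lang.Double.doubleToRawLongBits(0.0))
assert(normalize(0.0 / 0.0).isNaN)
```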
@@ -70,13 +70,6 @@ class GpuHashAggregateMeta(
       }
     }
     val groupingExpressionTypes = agg.groupingExpressions.map(_.dataType)
-    if (conf.hasNans &&
-      (groupingExpressionTypes.contains(FloatType) ||
-        groupingExpressionTypes.contains(DoubleType))) {
-      willNotWorkOnGpu("grouping expressions and some aggregations over floating point columns " +
-        "that may contain -0.0 and NaN are disabled. You can bypass this by setting " +
-        s"${RapidsConf.HAS_NANS}=false")
-    }
     if (agg.resultExpressions.isEmpty) {
       willNotWorkOnGpu("result expressions is empty")
     }
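The guard removed here used to reject any hash aggregate with float or double grouping keys while hasNans was set. It is redundant because Spark's planner already wraps such keys as KnownFloatingPointNormalized(NormalizeNaNAndZero(key)), and the previous file now enables both wrappers on the GPU. A small sketch of the behavior that now just works (local-mode boilerplate assumed):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// Four raw values but only two groups: -0.0 folds into 0.0 and all NaNs
// collapse, because the planner normalizes float grouping keys first.
Seq(0.0, -0.0, Double.NaN, 0.0 / 0.0).toDF("key")
  .groupBy($"key").count()
  .show()
```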
@@ -1691,8 +1691,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
     floatWithDifferentKindsOfNansAndZeros,
     conf = new SparkConf()
       .set(RapidsConf.HAS_NANS.key, "false")
-      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")
-      .set(RapidsConf.INCOMPATIBLE_OPS.key, "true")) {
+      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")) {
     frame => frame.groupBy(col("float")).agg(sum(col("int")))
   }
 
@@ -1701,8 +1700,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
     doubleWithDifferentKindsOfNansAndZeros,
     conf = new SparkConf()
       .set(RapidsConf.HAS_NANS.key, "false")
-      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")
-      .set(RapidsConf.INCOMPATIBLE_OPS.key, "true")) {
+      .set(RapidsConf.ENABLE_FLOAT_AGG.key, "true")) {
     frame => frame.groupBy(col("double")).agg(sum(col("int")))
   }
 }
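With the incompat tag gone, these suites only need to assert the data has no NaNs and enable variable-order float aggregation. A sketch of the trimmed configuration; the literal key strings are my assumption about what RapidsConf.HAS_NANS.key and RapidsConf.ENABLE_FLOAT_AGG.key resolve to in this era of the plugin:

```scala
import org.apache.spark.SparkConf

// Assumed literal keys behind the RapidsConf constants used above:
val conf = new SparkConf()
  .set("spark.rapids.sql.hasNans", "false")                 // HAS_NANS.key (assumed)
  .set("spark.rapids.sql.variableFloatAgg.enabled", "true") // ENABLE_FLOAT_AGG.key (assumed)
// RapidsConf.INCOMPATIBLE_OPS is no longer part of the recipe.
```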
