From 464264ca907ef25296a00a953f38de3c3403c9c5 Mon Sep 17 00:00:00 2001
From: mythrocks <mythrocks@gmail.com>
Date: Wed, 24 Jun 2020 10:41:54 -0700
Subject: [PATCH] Add conditional xfail test for DISTINCT aggregates with NaN
 (#261)

SPARK-32038 reports a regression in Apache Spark 3.0.0: it
fails to normalize NaN/zero float values during DISTINCT
aggregations. This causes a mismatch in results between
Apache Spark 3.0.0 on CPU and the RAPIDS Accelerator (which
returns the correct results).
SPARK-32038 was fixed in apache/spark#28876.

This commit introduces a conditional xfail test that passes
on Apache Spark 3.0.1 and 3.1+ (which fix SPARK-32038),
but produces an expected failure on Spark 3.0.0.
---
 .../src/main/python/hash_aggregate_test.py             | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py
index eb306a81ee3..60f7212b335 100644
--- a/integration_tests/src/main/python/hash_aggregate_test.py
+++ b/integration_tests/src/main/python/hash_aggregate_test.py
@@ -19,7 +19,7 @@
 from pyspark.sql.types import *
 from marks import *
 import pyspark.sql.functions as f
-from spark_session import with_cpu_session
+from spark_session import with_cpu_session, with_spark_session
 
 _no_nans_float_conf = {'spark.rapids.sql.variableFloatAgg.enabled': 'true',
                        'spark.rapids.sql.hasNans': 'false',
@@ -338,8 +338,11 @@ def test_hash_agg_with_nan_keys(data_gen):
         conf=_no_nans_float_conf)
 
 
-@pytest.mark.xfail(reason="count(distinct floats) fails when there are NaN values in the aggregation column."
-                          "(https://github.com/NVIDIA/spark-rapids/issues/194)")
+@pytest.mark.xfail(
+    condition=with_spark_session(lambda spark : spark.sparkContext.version == "3.0.0"),
+    reason="[SPARK-32038][SQL] NormalizeFloatingNumbers should also work on distinct aggregate "
+           "(https://github.com/apache/spark/pull/28876) "
+           "Fixed in later Apache Spark releases.")
 @approximate_float
 @ignore_order
 @pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
@@ -354,6 +357,5 @@ def test_count_distinct_with_nan_floats(data_gen):
             'from hash_agg_table group by a'),
         conf=_no_nans_float_conf)
 
-
 # TODO: Literal tests
 # TODO: First and Last tests