From 5523e50e0997be968fd3325545d7fcbd8b7234a1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Dec 2023 14:37:22 -0800 Subject: [PATCH 1/5] Fix test and update docs Signed-off-by: Nghia Truong --- docs/compatibility.md | 11 +++++++++++ integration_tests/src/main/python/cmp_test.py | 11 ++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 9d411f56d50..8043aa12d38 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -83,6 +83,17 @@ after Spark 3.1.0. We do not disable operations that produce different results due to `-0.0` in the data because it is considered to be a rare occurrence. +### `NaN` vs `NaN` + +Apache Spark does not have a consistent way to handle `NaN` comparison. Sometimes, all `NaN` are +considered as one unique value while other times they can be treated as different. The outcome of +`NaN` comparison can differ in various operations and also changed between Spark versions. +Our plugin tries to match its output with Apache Spark except for a few operation(s) listed below: + - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 but not from +Spark 3.1.3 (see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)). +On the other hand, our plugin always compares all `NaN` as equal value for this operation. + + ## Decimal Support Apache Spark supports decimal values with a precision up to 38. This equates to 128-bits. diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index a891b667016..d949f31deed 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -17,7 +17,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect from conftest import is_not_utc from data_gen import * -from spark_session import with_cpu_session, is_before_spark_330 +from spark_session import with_cpu_session, is_before_spark_313, is_before_spark_330 from pyspark.sql.types import * from marks import datagen_overrides import pyspark.sql.functions as f @@ -346,11 +346,16 @@ def test_in(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars))) +# We avoid testing inset with NaN in Spark < 3.2.0 since it has issue with NaN comparisons. +# See https://github.com/NVIDIA/spark-rapids/issues/9687. +test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \ + [FloatGen(no_nans=True), DoubleGen(no_nans=True)] \ + if is_before_spark_313() else eq_gens_with_decimal_gen + # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf # This is to test entries over that value. 
-@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687') -@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) @pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.parametrize('data_gen', test_inset_data_gen, ids=idfn) def test_in_set(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1 From c14c00ab8481bc88866b26bbcbd3f3a0d77dfeb5 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 1 Dec 2023 16:06:43 -0700 Subject: [PATCH 2/5] Update docs/compatibility.md Co-authored-by: Jason Lowe --- docs/compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 8043aa12d38..53be3655338 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -88,7 +88,7 @@ considered to be a rare occurrence. Apache Spark does not have a consistent way to handle `NaN` comparison. Sometimes, all `NaN` are considered as one unique value while other times they can be treated as different. The outcome of `NaN` comparison can differ in various operations and also changed between Spark versions. -Our plugin tries to match its output with Apache Spark except for a few operation(s) listed below: +The RAPIDS Accelerator tries to match its output with Apache Spark except for a few operation(s) listed below: - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 but not from Spark 3.1.3 (see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)). On the other hand, our plugin always compares all `NaN` as equal value for this operation. From 176dc4a79adab08eb3c71dde3dff1a459146f3a1 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 1 Dec 2023 16:07:28 -0700 Subject: [PATCH 3/5] Update docs/compatibility.md Co-authored-by: Jason Lowe --- docs/compatibility.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 53be3655338..82a85c9d490 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -89,9 +89,10 @@ Apache Spark does not have a consistent way to handle `NaN` comparison. Sometime considered as one unique value while other times they can be treated as different. The outcome of `NaN` comparison can differ in various operations and also changed between Spark versions. The RAPIDS Accelerator tries to match its output with Apache Spark except for a few operation(s) listed below: - - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 but not from -Spark 3.1.3 (see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)). -On the other hand, our plugin always compares all `NaN` as equal value for this operation. + - - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 and + prior versions, see [SPARK-36792](https://issues.apache.org/jira/browse/SPARK-36792) for more details. +The RAPIDS Accelerator compares `NaN` values as equal for this operation which matches +the behavior of Apache Spark 3.1.3 and later versions. 
## Decimal Support From a485401caf5408e9caf640685fd601e9575b5d7d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Dec 2023 15:08:18 -0800 Subject: [PATCH 4/5] Fix docs Signed-off-by: Nghia Truong --- docs/compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 82a85c9d490..4e1d604b1ea 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -89,7 +89,7 @@ Apache Spark does not have a consistent way to handle `NaN` comparison. Sometime considered as one unique value while other times they can be treated as different. The outcome of `NaN` comparison can differ in various operations and also changed between Spark versions. The RAPIDS Accelerator tries to match its output with Apache Spark except for a few operation(s) listed below: - - - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 and + - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 and prior versions, see [SPARK-36792](https://issues.apache.org/jira/browse/SPARK-36792) for more details. The RAPIDS Accelerator compares `NaN` values as equal for this operation which matches the behavior of Apache Spark 3.1.3 and later versions. From 138cb7f466b70aa8d47ae32f276f1bdc252352bd Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 1 Dec 2023 16:10:08 -0700 Subject: [PATCH 5/5] Update integration_tests/src/main/python/cmp_test.py --- integration_tests/src/main/python/cmp_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index d949f31deed..f2e08339363 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -346,7 +346,7 @@ def test_in(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars))) -# We avoid testing inset with NaN in Spark < 3.2.0 since it has issue with NaN comparisons. +# We avoid testing inset with NaN in Spark < 3.1.3 since it has issue with NaN comparisons. # See https://github.com/NVIDIA/spark-rapids/issues/9687. test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \ [FloatGen(no_nans=True), DoubleGen(no_nans=True)] \
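Note on the compatibility.md text above: it says Apache Spark is not consistent about `NaN` comparison. As background, Spark SQL intentionally departs from IEEE-754 here: ordinary SQL equality treats `NaN` as equal to itself, which is part of why individual operations such as `INSET` have diverged across versions. A minimal illustrative sketch, not part of the patches (the local session setup is an assumption for the example):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Spark SQL treats NaN as equal to itself in ordinary equality, unlike
# IEEE-754 floating point, where NaN never equals NaN.
spark.sql("SELECT double('NaN') = double('NaN') AS nan_eq").show()  # nan_eq: true

# Plain Python follows IEEE-754 semantics, so the same comparison is False.
print(float("nan") == float("nan"))  # False
```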
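The `test_in_set` change above hinges on `spark.sql.optimizer.inSetConversionThreshold`: Spark plans an `IN` predicate as a chain of equality checks (`In`) and only converts it to a set lookup (`InSet`) once the literal list exceeds that threshold, which is why the test builds `num_entries = threshold + 1` values. A hedged sketch of that behavior (the session setup, sample data, and column name `a` are assumptions for illustration):

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.master("local[1]").getOrCreate()
# Same conf lookup the test uses; the default value is 10.
threshold = int(spark.conf.get("spark.sql.optimizer.inSetConversionThreshold"))

df = spark.createDataFrame([(float("nan"),), (1.0,)], ["a"])

# At or below the threshold, the predicate stays planned as a chain of
# equality checks (In).
small = [float(i) for i in range(threshold - 1)] + [float("nan")]
df.filter(f.col("a").isin(small)).explain()

# Above the threshold, the optimizer converts the predicate to a hash-set
# lookup (InSet), the form whose NaN handling changed in SPARK-36792.
large = [float(i) for i in range(threshold)] + [float("nan")]
df.filter(f.col("a").isin(large)).explain()
```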
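One readability note on the `test_inset_data_gen` expression added in patch 1: Python's conditional expression binds more loosely than `+`, so the entire concatenated list forms the `if` branch. An equivalent, more explicit restatement (using the integration-test framework names from the patch; this is a paraphrase of the committed code, not a proposed change):

```python
# Equivalent form of the test_inset_data_gen assignment from the patch.
# On Spark < 3.1.3, drop the NaN-capable float/double generators and
# substitute NaN-free ones, since InSet mishandled NaN there
# (SPARK-36792); otherwise use the full generator list.
if is_before_spark_313():
    test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen
                           if gen != float_gen and gen != double_gen] + \
                          [FloatGen(no_nans=True), DoubleGen(no_nans=True)]
else:
    test_inset_data_gen = eq_gens_with_decimal_gen
```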