From 5523e50e0997be968fd3325545d7fcbd8b7234a1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Dec 2023 14:37:22 -0800 Subject: [PATCH 1/5] Fix test and update docs Signed-off-by: Nghia Truong --- docs/compatibility.md | 11 +++++++++++ integration_tests/src/main/python/cmp_test.py | 11 ++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 9d411f56d50..8043aa12d38 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -83,6 +83,17 @@ after Spark 3.1.0. We do not disable operations that produce different results due to `-0.0` in the data because it is considered to be a rare occurrence. +### `NaN` vs `NaN` + +Apache Spark does not have a consistent way to handle `NaN` comparison. Sometimes, all `NaN` are +considered as one unique value while other times they can be treated as different. The outcome of +`NaN` comparison can differ in various operations and also changed between Spark versions. +Our plugin tries to match its output with Apache Spark except for a few operation(s) listed below: + - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 but not from +Spark 3.1.3 (see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)). +On the other hand, our plugin always compares all `NaN` as equal value for this operation. + + ## Decimal Support Apache Spark supports decimal values with a precision up to 38. This equates to 128-bits. diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index a891b667016..d949f31deed 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -17,7 +17,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect from conftest import is_not_utc from data_gen import * -from spark_session import with_cpu_session, is_before_spark_330 +from spark_session import with_cpu_session, is_before_spark_313, is_before_spark_330 from pyspark.sql.types import * from marks import datagen_overrides import pyspark.sql.functions as f @@ -346,11 +346,16 @@ def test_in(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars))) +# We avoid testing inset with NaN in Spark < 3.2.0 since it has issue with NaN comparisons. +# See https://github.com/NVIDIA/spark-rapids/issues/9687. +test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \ + [FloatGen(no_nans=True), DoubleGen(no_nans=True)] \ + if is_before_spark_313() else eq_gens_with_decimal_gen + # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf # This is to test entries over that value. 
-@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687') -@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) @pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.parametrize('data_gen', test_inset_data_gen, ids=idfn) def test_in_set(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1 From c14c00ab8481bc88866b26bbcbd3f3a0d77dfeb5 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 1 Dec 2023 16:06:43 -0700 Subject: [PATCH 2/5] Update docs/compatibility.md Co-authored-by: Jason Lowe --- docs/compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 8043aa12d38..53be3655338 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -88,7 +88,7 @@ considered to be a rare occurrence. Apache Spark does not have a consistent way to handle `NaN` comparison. Sometimes, all `NaN` are considered as one unique value while other times they can be treated as different. The outcome of `NaN` comparison can differ in various operations and also changed between Spark versions. -Our plugin tries to match its output with Apache Spark except for a few operation(s) listed below: +The RAPIDS Accelerator tries to match its output with Apache Spark except for a few operation(s) listed below: - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 but not from Spark 3.1.3 (see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)). On the other hand, our plugin always compares all `NaN` as equal value for this operation. From 176dc4a79adab08eb3c71dde3dff1a459146f3a1 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 1 Dec 2023 16:07:28 -0700 Subject: [PATCH 3/5] Update docs/compatibility.md Co-authored-by: Jason Lowe --- docs/compatibility.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 53be3655338..82a85c9d490 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -89,9 +89,10 @@ Apache Spark does not have a consistent way to handle `NaN` comparison. Sometime considered as one unique value while other times they can be treated as different. The outcome of `NaN` comparison can differ in various operations and also changed between Spark versions. The RAPIDS Accelerator tries to match its output with Apache Spark except for a few operation(s) listed below: - - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 but not from -Spark 3.1.3 (see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)). -On the other hand, our plugin always compares all `NaN` as equal value for this operation. + - - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 and + prior versions, see [SPARK-36792](https://issues.apache.org/jira/browse/SPARK-36792) for more details. +The RAPIDS Accelerator compares `NaN` values as equal for this operation which matches +the behavior of Apache Spark 3.1.3 and later versions. 
## Decimal Support From a485401caf5408e9caf640685fd601e9575b5d7d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 1 Dec 2023 15:08:18 -0800 Subject: [PATCH 4/5] Fix docs Signed-off-by: Nghia Truong --- docs/compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 82a85c9d490..4e1d604b1ea 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -89,7 +89,7 @@ Apache Spark does not have a consistent way to handle `NaN` comparison. Sometime considered as one unique value while other times they can be treated as different. The outcome of `NaN` comparison can differ in various operations and also changed between Spark versions. The RAPIDS Accelerator tries to match its output with Apache Spark except for a few operation(s) listed below: - - - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 and + - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 and prior versions, see [SPARK-36792](https://issues.apache.org/jira/browse/SPARK-36792) for more details. The RAPIDS Accelerator compares `NaN` values as equal for this operation which matches the behavior of Apache Spark 3.1.3 and later versions. From 138cb7f466b70aa8d47ae32f276f1bdc252352bd Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 1 Dec 2023 16:10:08 -0700 Subject: [PATCH 5/5] Update integration_tests/src/main/python/cmp_test.py --- integration_tests/src/main/python/cmp_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index d949f31deed..f2e08339363 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -346,7 +346,7 @@ def test_in(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars))) -# We avoid testing inset with NaN in Spark < 3.2.0 since it has issue with NaN comparisons. +# We avoid testing inset with NaN in Spark < 3.1.3 since it has issue with NaN comparisons. # See https://github.com/NVIDIA/spark-rapids/issues/9687. test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \ [FloatGen(no_nans=True), DoubleGen(no_nans=True)] \
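Note on the compatibility.md text above: it says Apache Spark is not consistent about `NaN` comparison. As background, Spark SQL intentionally departs from IEEE-754 here: ordinary SQL equality treats `NaN` as equal to itself, which is part of why individual operations such as `INSET` have diverged across versions. A minimal illustrative sketch, not part of the patches (the local session setup is an assumption for the example):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Spark SQL treats NaN as equal to itself in ordinary equality, unlike
# IEEE-754 floating point, where NaN never equals NaN.
spark.sql("SELECT double('NaN') = double('NaN') AS nan_eq").show()  # nan_eq: true

# Plain Python follows IEEE-754 semantics, so the same comparison is False.
print(float("nan") == float("nan"))  # False
```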
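The `test_in_set` change above hinges on `spark.sql.optimizer.inSetConversionThreshold`: Spark plans an `IN` predicate as a chain of equality checks (`In`) and only converts it to a set lookup (`InSet`) once the literal list exceeds that threshold, which is why the test builds `num_entries = threshold + 1` values. A hedged sketch of that behavior (the session setup, sample data, and column name `a` are assumptions for illustration):

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.master("local[1]").getOrCreate()
# Same conf lookup the test uses; the default value is 10.
threshold = int(spark.conf.get("spark.sql.optimizer.inSetConversionThreshold"))

df = spark.createDataFrame([(float("nan"),), (1.0,)], ["a"])

# At or below the threshold, the predicate stays planned as a chain of
# equality checks (In).
small = [float(i) for i in range(threshold - 1)] + [float("nan")]
df.filter(f.col("a").isin(small)).explain()

# Above the threshold, the optimizer converts the predicate to a hash-set
# lookup (InSet), the form whose NaN handling changed in SPARK-36792.
large = [float(i) for i in range(threshold)] + [float("nan")]
df.filter(f.col("a").isin(large)).explain()
```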
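One readability note on the `test_inset_data_gen` expression added in patch 1: Python's conditional expression binds more loosely than `+`, so the entire concatenated list forms the `if` branch. An equivalent, more explicit restatement (using the integration-test framework names from the patch; this is a paraphrase of the committed code, not a proposed change):

```python
# Equivalent form of the test_inset_data_gen assignment from the patch.
# On Spark < 3.1.3, drop the NaN-capable float/double generators and
# substitute NaN-free ones, since InSet mishandled NaN there
# (SPARK-36792); otherwise use the full generator list.
if is_before_spark_313():
    test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen
                           if gen != float_gen and gen != double_gen] + \
                          [FloatGen(no_nans=True), DoubleGen(no_nans=True)]
else:
    test_inset_data_gen = eq_gens_with_decimal_gen
```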