From 6cd96d8a58b01f6619661f840f6a269eb8c4cb13 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 11 May 2022 18:15:40 +0800 Subject: [PATCH 1/2] Add test cases for casting string to date in ANSI mode Signed-off-by: Chong Gao --- integration_tests/src/main/python/cast_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index 62a77ae2004..d6d0414c811 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -65,6 +65,22 @@ def test_cast_string_date_valid_format(): lambda spark : unary_op_df(spark, StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}')).select(f.col('a').cast(DateType())), conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'}) +@pytest.mark.skipif(is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+") +@pytest.mark.parametrize('invalid', ['200', '1970A', '1970 A', '1970T', '1970 T', '1970-01T', '1970-01 A', + '1970-01-01A', # 1970-01-01T is OK, 1970-01-01A is NOK + '2022-02-29', # nonexistent day + '200-1-1', + '2001-13-1', # nonexistent day + '2001-1-32', # nonexistent day + '2001-1-32' # nonexistent day + ]) +def test_cast_string_date_invalid_ansi(invalid): + assert_gpu_and_cpu_error( + lambda spark: spark.createDataFrame([(invalid,)], "a string").select(f.col('a').cast(DateType())).collect(), + conf={'spark.rapids.sql.hasExtendedYearValues': 'false', + 'spark.sql.ansi.enabled': 'true'}, + error_message="DateTimeException") + def test_cast_string_ts_valid_format(): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. 
From 79f102cba6a46f3c0dccf0c9f509d3e72aa54232 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 12 May 2022 10:11:52 +0800 Subject: [PATCH 2/2] Update test cases --- .../src/main/python/cast_test.py | 44 +++++++++++++++---- .../com/nvidia/spark/rapids/CastOpSuite.scala | 13 ------ 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index d6d0414c811..832857126c1 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -65,15 +65,36 @@ def test_cast_string_date_valid_format(): lambda spark : unary_op_df(spark, StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}')).select(f.col('a').cast(DateType())), conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'}) +invalid_values_string_to_date = ['200', ' 1970A', '1970 A', '1970T', # not conform to "yyyy" after trim + '1970 T', ' 1970-01T', '1970-01 A', # not conform to "yyyy-[M]M" after trim + # not conform to 'yyyy-[M]M-[d]d', "yyyy-[M]M-[d]d *" or "yyyy-[M]M-[d]d T*" after trim + '1970-01-01A', + '2022-02-29', # nonexistent day + '200-1-1', # 200 not conform to 'yyyy' + '2001-13-1', # nonexistent month + '2001-1-32', # nonexistent day + 'not numbers', + '666666666' + ] +valid_values_string_to_date = ['2001', ' 2001 ', '1970-01', ' 1970-1 ', + '1970-1-01', ' 1970-10-5 ', ' 2001-10-16 ', # 'yyyy-[M]M-[d]d' after trim + '1970-01-01T', '1970-01-01T-no_impact', # "yyyy-[M]M-[d]d T*" after trim + ' 1970-01-01 A', '1970-01-01 B ' # "yyyy-[M]M-[d]d *" after trim + ] +values_string_to_data = invalid_values_string_to_date + valid_values_string_to_date + +# test Spark versions < 3.2.0, ANSI mode +@pytest.mark.skipif(not is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+") +def test_cast_string_date_invalid_ansi_before_320(): + data_rows = [(v,) for v in values_string_to_data] + assert_gpu_and_cpu_are_equal_collect( + lambda 
spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), + conf={'spark.rapids.sql.hasExtendedYearValues': 'false', + 'spark.sql.ansi.enabled': 'true'}, ) + +# test Spark versions >= 3.2.0, ANSI mode @pytest.mark.skipif(is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+") -@pytest.mark.parametrize('invalid', ['200', '1970A', '1970 A', '1970T', '1970 T', '1970-01T', '1970-01 A', - '1970-01-01A', # 1970-01-01T is OK, 1970-01-01A is NOK - '2022-02-29', # nonexistent day - '200-1-1', - '2001-13-1', # nonexistent day - '2001-1-32', # nonexistent day - '2001-1-32' # nonexistent day - ]) +@pytest.mark.parametrize('invalid', invalid_values_string_to_date) def test_cast_string_date_invalid_ansi(invalid): assert_gpu_and_cpu_error( lambda spark: spark.createDataFrame([(invalid,)], "a string").select(f.col('a').cast(DateType())).collect(), @@ -81,6 +102,13 @@ def test_cast_string_date_invalid_ansi(invalid): 'spark.sql.ansi.enabled': 'true'}, error_message="DateTimeException") +# test all Spark versions, non-ANSI mode, invalid values are converted to NULL +def test_cast_string_date_non_ansi(): + data_rows = [(v,) for v in values_string_to_data] + assert_gpu_and_cpu_are_equal_collect( + lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), + conf={'spark.rapids.sql.hasExtendedYearValues': 'false'}) + def test_cast_string_ts_valid_format(): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. 
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala index ae4db3996a6..71f69a8811e 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala @@ -160,19 +160,6 @@ class CastOpSuite extends GpuExpressionTestSuite { generateRandomStrings(Some(DATE_CHARS), maxStringLen = 8, Some("2021"))) } - test("Cast from string to date ANSI mode with valid values") { - testCastStringTo(DataTypes.DateType, Seq("2021-01-01", "2021-02-01"), - ansiMode = AnsiExpectSuccess) - } - - test("Cast from string to date ANSI mode with invalid values") { - assumeSpark320orLater - // test the values individually - Seq("2021-20-60", "not numbers", "666666666").foreach { value => - testCastStringTo(DataTypes.DateType, Seq(value), ansiMode = AnsiExpectFailure) - } - } - test("Cast from string to timestamp") { testCastStringTo(DataTypes.TimestampType, timestampsAsStringsSeq(castStringToTimestamp = true, validOnly = false))