From cb48046726a66f9aac9d54c767a01b71b4e03015 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 12 Feb 2021 11:00:29 -0700 Subject: [PATCH 1/8] CAST string to temporal types now uses isTimestamp to detect valid inputs Signed-off-by: Andy Grove --- .../com/nvidia/spark/rapids/GpuCast.scala | 45 ++++++++++++++----- .../com/nvidia/spark/rapids/CastOpSuite.scala | 4 ++ 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 6a3f8624a8d..b1e23a78975 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -637,10 +637,16 @@ case class GpuCast( regex: String, cudfFormat: String): ColumnVector = { - withResource(Scalar.fromNull(DType.TIMESTAMP_DAYS)) { nullScalar => - withResource(input.matchesRe(regex)) { isMatch => - withResource(input.asTimestampDays(cudfFormat)) { asDays => - isMatch.ifElse(asDays, nullScalar) + val isValidDate = withResource(input.matchesRe(regex)) { isMatch => + withResource(input.isTimestamp(cudfFormat)) { isTimestamp => + isMatch.and(isTimestamp) + } + } + + withResource(isValidDate) { isValidDate => + withResource(input.asTimestampDays(cudfFormat)) { asDays => + withResource(Scalar.fromNull(DType.TIMESTAMP_DAYS)) { nullScalar => + isValidDate.ifElse(asDays, nullScalar) } } } @@ -653,10 +659,16 @@ case class GpuCast( cudfFormat: String, orElse: ColumnVector): ColumnVector = { - withResource(input.matchesRe(regex)) { isMatch => + val isValidDate = withResource(input.matchesRe(regex)) { isMatch => + withResource(input.isTimestamp(cudfFormat)) { isTimestamp => + isMatch.and(isTimestamp) + } + } + + withResource(isValidDate) { isValidDate => withResource(input.asTimestampDays(cudfFormat)) { asDays => withResource(orElse) { orElse => - isMatch.ifElse(asDays, orElse) + isValidDate.ifElse(asDays, orElse) } } } @@ -721,10 +733,16 @@ case class GpuCast( regex: String, cudfFormat: String): ColumnVector = { - withResource(Scalar.fromNull(DType.TIMESTAMP_MICROSECONDS)) { nullScalar => - withResource(input.matchesRe(regex)) { isMatch => + val isValidTimestamp = withResource(input.matchesRe(regex)) { isMatch => + withResource(input.isTimestamp(cudfFormat)) { isTimestamp => + isMatch.and(isTimestamp) + } + } + + withResource(isValidTimestamp) { isValidTimestamp => + withResource(Scalar.fromNull(DType.TIMESTAMP_MICROSECONDS)) { nullScalar => withResource(input.asTimestampMicroseconds(cudfFormat)) { asDays => - isMatch.ifElse(asDays, nullScalar) + isValidTimestamp.ifElse(asDays, nullScalar) } } } @@ -737,10 +755,15 @@ case class GpuCast( cudfFormat: String, orElse: ColumnVector): ColumnVector = { - withResource(input.matchesRe(regex)) { isMatch => + val isValidTimestamp = withResource(input.matchesRe(regex)) { isMatch => + withResource(input.isTimestamp(cudfFormat)) { isTimestamp => + isMatch.and(isTimestamp) + } + } + withResource(isValidTimestamp) { isValidTimestamp => withResource(input.asTimestampMicroseconds(cudfFormat)) { asDays => withResource(orElse) { orElse => - isMatch.ifElse(asDays, orElse) + isValidTimestamp.ifElse(asDays, orElse) } } } diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala index 879382a6901..41aa03cc376 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala @@ -943,6 +943,10 @@ object CastOpSuite { "2018-1random_text", "2018-11-08random_text", "2018-11-9random_text", + // date component out of range + "2020-13-01", + "2020-12-32", + "2020-02-30", // `yyyy-[m]m-[d]dT*` in Spark 3.1+ these no longer work for AnsiCast, but did before "2010-1-01T!@#$%", "2010-1-02T,", From 53c917b126af4826c64d854e7323bb1fe08b4e76 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 12 Feb 2021 17:02:04 -0700 Subject: [PATCH 2/8] Bug fix in timestamp formats --- .../scala/com/nvidia/spark/rapids/GpuCast.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index b1e23a78975..df674f6e295 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -97,8 +97,10 @@ object GpuCast { private val TIMESTAMP_REGEX_YYYY = "\\A\\d{4}\\Z" private val TIMESTAMP_REGEX_YYYY_MM = "\\A\\d{4}\\-\\d{2}[ ]?\\Z" private val TIMESTAMP_REGEX_YYYY_MM_DD = "\\A\\d{4}\\-\\d{2}\\-\\d{2}[ ]?\\Z" - private val TIMESTAMP_REGEX_FULL = - "\\A\\d{4}\\-\\d{2}\\-\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z\\Z" + private val TIMESTAMP_REGEX_FULL_1 = + "\\A\\d{4}\\-\\d{2}\\-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z\\Z" + private val TIMESTAMP_REGEX_FULL_2 = + "\\A\\d{4}\\-\\d{2}\\-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z\\Z" private val TIMESTAMP_REGEX_NO_DATE = "\\A[T]?(\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z)\\Z" /** @@ -802,10 +804,11 @@ case class GpuCast( // convert dates that are in valid timestamp formats val converted = - convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_FULL, "%Y-%m-%dT%H:%M:%SZ%f", - convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM_DD, "%Y-%m-%d", - convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM, "%Y-%m", - convertTimestampOrNull(sanitizedInput, TIMESTAMP_REGEX_YYYY, "%Y")))) + convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_FULL_1, "%Y-%m-%d %H:%M:%S.%f", + convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_FULL_2, "%Y-%m-%dT%H:%M:%S.%f", + convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM_DD, "%Y-%m-%d", + convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM, "%Y-%m", + convertTimestampOrNull(sanitizedInput, TIMESTAMP_REGEX_YYYY, "%Y"))))) // handle special dates like "epoch", "now", etc. val finalResult = specialDates.foldLeft(converted)((prev, specialDate) => From 0f3520f1168dabb990e26f8e23315af37bc0edd8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 16 Feb 2021 10:35:39 -0700 Subject: [PATCH 3/8] Update copyright and add additional test cases Signed-off-by: Andy Grove --- .../src/main/scala/com/nvidia/spark/rapids/GpuCast.scala | 2 +- .../src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index df674f6e295..da12e179524 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala index 41aa03cc376..12f36d3d5ff 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -947,6 +947,8 @@ object CastOpSuite { "2020-13-01", "2020-12-32", "2020-02-30", + "2030-00-11 12:02:03.012345Z", + "2030-00-11T12:02:03.012345Z", // `yyyy-[m]m-[d]dT*` in Spark 3.1+ these no longer work for AnsiCast, but did before "2010-1-01T!@#$%", "2010-1-02T,", From 48e1c9ce184b14248a26fe5006a33371a143f7cf Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 16 Feb 2021 11:26:34 -0700 Subject: [PATCH 4/8] Refactor to avoid additional regex call Signed-off-by: Andy Grove --- .../com/nvidia/spark/rapids/GpuCast.scala | 47 +++++++++++++++---- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index da12e179524..6fa5c106c8b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -97,10 +97,8 @@ object GpuCast { private val TIMESTAMP_REGEX_YYYY = "\\A\\d{4}\\Z" private val TIMESTAMP_REGEX_YYYY_MM = "\\A\\d{4}\\-\\d{2}[ ]?\\Z" private val TIMESTAMP_REGEX_YYYY_MM_DD = "\\A\\d{4}\\-\\d{2}\\-\\d{2}[ ]?\\Z" - private val TIMESTAMP_REGEX_FULL_1 = - "\\A\\d{4}\\-\\d{2}\\-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z\\Z" - private val TIMESTAMP_REGEX_FULL_2 = - "\\A\\d{4}\\-\\d{2}\\-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z\\Z" + private val TIMESTAMP_REGEX_FULL = + "\\A\\d{4}\\-\\d{2}\\-\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z\\Z" private val TIMESTAMP_REGEX_NO_DATE = "\\A[T]?(\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z)\\Z" /** @@ -771,6 +769,37 @@ case class GpuCast( } } + /** This method does not close the `input` ColumnVector. */ + def convertTimestampFullOr( + input: ColumnVector, + orElse: ColumnVector): ColumnVector = { + + val cudfFormat1 = "%Y-%m-%d %H:%M:%S.%f" + val cudfFormat2 = "%Y-%m-%dT%H:%M:%S.%f" + + // valid dates must match the regex and either of the cuDF formats + val isCudfMatch = withResource(input.isTimestamp(cudfFormat1)) { isTimestamp1 => + withResource(input.isTimestamp(cudfFormat2)) { isTimestamp2 => + isTimestamp1.or(isTimestamp2) + } + } + val isValidTimestamp = withResource(isCudfMatch) { isCudfMatch => + withResource(input.matchesRe(TIMESTAMP_REGEX_FULL)) { isRegexMatch => + isCudfMatch.and(isRegexMatch) + } + } + + // we only need to parse with one of the cuDF formats because the parsing code ignores + // the ' ' or 'T' between the date and time components + withResource(isValidTimestamp) { isValidTimestamp => + withResource(input.asTimestampMicroseconds(cudfFormat1)) { asDays => + withResource(orElse) { orElse => + isValidTimestamp.ifElse(asDays, orElse) + } + } + } + } + // special timestamps val today = DateUtils.currentDate() val todayStr = new SimpleDateFormat("yyyy-MM-dd") @@ -801,14 +830,12 @@ case class GpuCast( } withResource(sanitizedInput) { sanitizedInput => - // convert dates that are in valid timestamp formats val converted = - convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_FULL_1, "%Y-%m-%d %H:%M:%S.%f", - convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_FULL_2, "%Y-%m-%dT%H:%M:%S.%f", - convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM_DD, "%Y-%m-%d", - convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM, "%Y-%m", - convertTimestampOrNull(sanitizedInput, TIMESTAMP_REGEX_YYYY, "%Y"))))) + convertTimestampFullOr(sanitizedInput, + convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM_DD, "%Y-%m-%d", + convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM, "%Y-%m", + convertTimestampOrNull(sanitizedInput, TIMESTAMP_REGEX_YYYY, "%Y")))) // handle special dates like "epoch", "now", etc. val finalResult = specialDates.foldLeft(converted)((prev, specialDate) => From 2b0d86ceec1a2bf94ce6ccc5f765e89656dadd15 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 16 Feb 2021 11:40:51 -0700 Subject: [PATCH 5/8] Remove regex for full timestamp match Signed-off-by: Andy Grove --- .../src/main/scala/com/nvidia/spark/rapids/GpuCast.scala | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 6fa5c106c8b..1a269f59c8a 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -97,8 +97,6 @@ object GpuCast { private val TIMESTAMP_REGEX_YYYY = "\\A\\d{4}\\Z" private val TIMESTAMP_REGEX_YYYY_MM = "\\A\\d{4}\\-\\d{2}[ ]?\\Z" private val TIMESTAMP_REGEX_YYYY_MM_DD = "\\A\\d{4}\\-\\d{2}\\-\\d{2}[ ]?\\Z" - private val TIMESTAMP_REGEX_FULL = - "\\A\\d{4}\\-\\d{2}\\-\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z\\Z" private val TIMESTAMP_REGEX_NO_DATE = "\\A[T]?(\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z)\\Z" /** @@ -778,16 +776,11 @@ case class GpuCast( val cudfFormat2 = "%Y-%m-%dT%H:%M:%S.%f" // valid dates must match the regex and either of the cuDF formats - val isCudfMatch = withResource(input.isTimestamp(cudfFormat1)) { isTimestamp1 => + val isValidTimestamp = withResource(input.isTimestamp(cudfFormat1)) { isTimestamp1 => withResource(input.isTimestamp(cudfFormat2)) { isTimestamp2 => isTimestamp1.or(isTimestamp2) } } - val isValidTimestamp = withResource(isCudfMatch) { isCudfMatch => - withResource(input.matchesRe(TIMESTAMP_REGEX_FULL)) { isRegexMatch => - isCudfMatch.and(isRegexMatch) - } - } // we only need to parse with one of the cuDF formats because the parsing code ignores // the ' ' or 'T' between the date and time components From 7316576924962fc070ccf926a1cb944127deaa9b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 16 Feb 2021 11:42:34 -0700 Subject: [PATCH 6/8] Update comment --- sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 1a269f59c8a..b7d5b25aed1 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -775,7 +775,7 @@ case class GpuCast( val cudfFormat1 = "%Y-%m-%d %H:%M:%S.%f" val cudfFormat2 = "%Y-%m-%dT%H:%M:%S.%f" - // valid dates must match the regex and either of the cuDF formats + // valid dates must match either of the cuDF formats val isValidTimestamp = withResource(input.isTimestamp(cudfFormat1)) { isTimestamp1 => withResource(input.isTimestamp(cudfFormat2)) { isTimestamp2 => isTimestamp1.or(isTimestamp2) From 9ab1292a8e858c5ebfc519ea48119533e9d0706e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 16 Feb 2021 14:23:00 -0700 Subject: [PATCH 7/8] Add test data to demonstrate why regex was needed Signed-off-by: Andy Grove --- tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala index 12f36d3d5ff..d9a4efe1a7e 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala @@ -934,6 +934,7 @@ object CastOpSuite { "2010-01-6T 12:34:56.000111Z", "2010-01-6 T 12:34:56.000111Z", "2010-01-6 T12:34:56.000111Z", + "2030-11-11 12:02:03.012345Z TRAILING TEXT", "2010-01-6 ", "2010-01-6 T", "2010-01-6 T\n", From d5e0055320a40eb144f73d411e463426288b42bf Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 16 Feb 2021 14:26:08 -0700 Subject: [PATCH 8/8] Reinstate regex for full timestamp Signed-off-by: Andy Grove --- .../main/scala/com/nvidia/spark/rapids/GpuCast.scala | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index b7d5b25aed1..6fa5c106c8b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -97,6 +97,8 @@ object GpuCast { private val TIMESTAMP_REGEX_YYYY = "\\A\\d{4}\\Z" private val TIMESTAMP_REGEX_YYYY_MM = "\\A\\d{4}\\-\\d{2}[ ]?\\Z" private val TIMESTAMP_REGEX_YYYY_MM_DD = "\\A\\d{4}\\-\\d{2}\\-\\d{2}[ ]?\\Z" + private val TIMESTAMP_REGEX_FULL = + "\\A\\d{4}\\-\\d{2}\\-\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z\\Z" private val TIMESTAMP_REGEX_NO_DATE = "\\A[T]?(\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z)\\Z" /** @@ -775,12 +777,17 @@ case class GpuCast( val cudfFormat1 = "%Y-%m-%d %H:%M:%S.%f" val cudfFormat2 = "%Y-%m-%dT%H:%M:%S.%f" - // valid dates must match either of the cuDF formats - val isValidTimestamp = withResource(input.isTimestamp(cudfFormat1)) { isTimestamp1 => + // valid dates must match the regex and either of the cuDF formats + val isCudfMatch = withResource(input.isTimestamp(cudfFormat1)) { isTimestamp1 => withResource(input.isTimestamp(cudfFormat2)) { isTimestamp2 => isTimestamp1.or(isTimestamp2) } } + val isValidTimestamp = withResource(isCudfMatch) { isCudfMatch => + withResource(input.matchesRe(TIMESTAMP_REGEX_FULL)) { isRegexMatch => + isCudfMatch.and(isRegexMatch) + } + } // we only need to parse with one of the cuDF formats because the parsing code ignores // the ' ' or 'T' between the date and time components