From fb8463c02ba129d85ae7433d255cea005e793451 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 28 Jul 2020 07:44:36 -0500 Subject: [PATCH 1/4] Added in StringRPad and StringLPad Signed-off-by: Robert (Bobby) Evans --- docs/configs.md | 2 + integration_tests/src/main/python/data_gen.py | 4 +- .../src/main/python/string_test.py | 22 ++++++ .../nvidia/spark/rapids/GpuOverrides.scala | 44 +++++++++++ .../spark/sql/rapids/stringFunctions.scala | 75 ++++++++++++++++++- 5 files changed, 144 insertions(+), 3 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index 4720050e5f8..d667b03f0d3 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -183,7 +183,9 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.SpecifiedWindowFrame| |specification of the width of the group (or "frame") of input rows around which a window function is evaluated|true|None| spark.rapids.sql.expression.Sqrt|`sqrt`|square root|true|None| spark.rapids.sql.expression.StartsWith| |Starts With|true|None| +spark.rapids.sql.expression.StringLPad|`lpad`|Pad a string on the left|true|None| spark.rapids.sql.expression.StringLocate|`position`, `locate`|Substring search operator|true|None| +spark.rapids.sql.expression.StringRPad|`rpad`|Pad a string on the right|true|None| spark.rapids.sql.expression.StringReplace|`replace`|StringReplace operator|true|None| spark.rapids.sql.expression.StringTrim|`trim`|StringTrim operator|true|None| spark.rapids.sql.expression.StringTrimLeft|`ltrim`|StringTrimLeft operator|true|None| diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index cdd637ca122..b8230dde452 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -160,7 +160,7 @@ def with_special_pattern(self, pattern, flags=0, charset=sre_yield.CHARSET, weig length = int(len(strs)) except OverflowError: length = _MAX_CHOICES - return self.with_special_case(lambda rand : strs[rand.randint(0, length)], weight=weight) + return self.with_special_case(lambda rand : strs[rand.randrange(0, length)], weight=weight) def start(self, rand): strs = self.base_strs @@ -168,7 +168,7 @@ def start(self, rand): length = int(len(strs)) except OverflowError: length = _MAX_CHOICES - self._start(rand, lambda : strs[rand.randint(0, length)]) + self._start(rand, lambda : strs[rand.randrange(0, length)]) _BYTE_MIN = -(1 << 7) _BYTE_MAX = (1 << 7) - 1 diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 743ced11ceb..7cc03295dc8 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -35,6 +35,28 @@ def test_substring_index(data_gen,delim): f.substring_index(f.col('a'), delim, -1), f.substring_index(f.col('a'), delim, -4))) +# ONLY LITERAL WIDTH AND PAD ARE SUPPORTED +def test_lpad(): + gen = mk_str_gen('.{0,5}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'LPAD(a, 2, " ")', + 'LPAD(a, NULL, " ")', + 'LPAD(a, 5, NULL)', + 'LPAD(a, 5, "G")', + 'LPAD(a, -1, "G")')) + +# ONLY LITERAL WIDTH AND PAD ARE SUPPORTED +def test_rpad(): + gen = mk_str_gen('.{0,5}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'RPAD(a, 2, " ")', + 'RPAD(a, NULL, " ")', + 'RPAD(a, 5, NULL)', + 'RPAD(a, 5, "G")', + 'RPAD(a, -1, "G")')) + # ONLY LITERAL SEARCH PARAMS ARE SUPPORTED def test_position(): gen = mk_str_gen('.{0,3}Z_Z.{0,3}A.{0,3}') diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 8de6e9ffeac..347dbc28470 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -1282,6 +1282,50 @@ object GpuOverrides { override def convertToGpu(child: Expression): GpuExpression = GpuLower(child) }) .incompat(CASE_MODIFICATION_INCOMPAT), + expr[StringLPad]( + "Pad a string on the left", + (in, conf, p, r) => new TernaryExprMeta[StringLPad](in, conf, p, r) { + override def tagExprForGpu(): Unit = { + if (!isLit(in.len)) { + willNotWorkOnGpu("only literal length is supported") + } + + val padLit = extractLit(in.pad) + if (padLit.isEmpty) { + willNotWorkOnGpu("only literal pad is supported") + } else if (padLit.get.value != null && + padLit.get.value.asInstanceOf[UTF8String].toString.length != 1) { + willNotWorkOnGpu("only a single character is supported for pad") + } + } + override def convertToGpu( + str: Expression, + width: Expression, + pad: Expression): GpuExpression = + GpuStringLPad(str, width, pad) + }), + expr[StringRPad]( + "Pad a string on the right", + (in, conf, p, r) => new TernaryExprMeta[StringRPad](in, conf, p, r) { + override def tagExprForGpu(): Unit = { + if (!isLit(in.len)) { + willNotWorkOnGpu("only literal length is supported") + } + + val padLit = extractLit(in.pad) + if (padLit.isEmpty) { + willNotWorkOnGpu("only literal pad is supported") + } else if (padLit.get.value != null && + padLit.get.value.asInstanceOf[UTF8String].toString.length != 1) { + willNotWorkOnGpu("only a single character is supported for pad") + } + } + override def convertToGpu( + str: Expression, + width: Expression, + pad: Expression): GpuExpression = + GpuStringRPad(str, width, pad) + }), expr[StringLocate]( "Substring search operator", (in, conf, p, r) => new TernaryExprMeta[StringLocate](in, conf, p, r) { diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 4805733cee4..9f05a2dfea3 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.rapids import scala.collection.mutable.ArrayBuffer -import ai.rapids.cudf.{ColumnVector, Scalar, Table} +import ai.rapids.cudf.{ColumnVector, DType, PadSide, Scalar, Table} import com.nvidia.spark.rapids._ import com.nvidia.spark.rapids.RapidsPluginImplicits._ @@ -696,3 +696,76 @@ case class GpuSubstringIndex(strExpr: Expression, "Internal Error: this version of substring index is not supported") } +trait BasePad extends GpuTernaryExpression with ImplicitCastInputTypes with NullIntolerant { + + val str: Expression + val len: Expression + val pad: Expression + val direction: PadSide + + override def children: Seq[Expression] = str :: len :: pad :: Nil + override def dataType: DataType = StringType + override def inputTypes: Seq[DataType] = Seq(StringType, IntegerType, StringType) + override def prettyName: String = "lpad" + + override def doColumnar(str: GpuColumnVector, len: Scalar, pad: Scalar): GpuColumnVector = { + if (len.isValid && pad.isValid) { + val l = math.max(0, len.getInt) + withResource(str.getBase.pad(l, direction, pad.getJavaString)) { padded => + GpuColumnVector.from(padded.substring(0, l)) + } + } else { + withResource(Scalar.fromNull(DType.STRING)) { ns => + GpuColumnVector.from(ColumnVector.fromScalar(ns, str.getRowCount.toInt)) + } + } + } + + override def doColumnar( + str: GpuColumnVector, + len: GpuColumnVector, + pad: GpuColumnVector): GpuColumnVector = + throw new IllegalStateException("This is not supported yet") + + override def doColumnar( + str: Scalar, + len: GpuColumnVector, + pad: GpuColumnVector): GpuColumnVector = + throw new IllegalStateException("This is not supported yet") + + override def doColumnar(str: Scalar, len: Scalar, pad: GpuColumnVector): GpuColumnVector = + throw new IllegalStateException("This is not supported yet") + + override def doColumnar(str: Scalar, len: GpuColumnVector, pad: Scalar): GpuColumnVector = + throw new IllegalStateException("This is not supported yet") + + override def doColumnar( + str: GpuColumnVector, + len: Scalar, + pad: GpuColumnVector): GpuColumnVector = + throw new IllegalStateException("This is not supported yet") + + override def doColumnar( + str: GpuColumnVector, + len: GpuColumnVector, + pad: Scalar): GpuColumnVector = + throw new IllegalStateException("This is not supported yet") +} + +case class GpuStringLPad(str: Expression, len: Expression, pad: Expression) + extends BasePad { + val direction = PadSide.LEFT + + def this(str: Expression, len: Expression) = { + this(str, len, GpuLiteral(" ", StringType)) + } +} + +case class GpuStringRPad(str: Expression, len: Expression, pad: Expression) + extends BasePad { + val direction = PadSide.RIGHT + + def this(str: Expression, len: Expression) = { + this(str, len, GpuLiteral(" ", StringType)) + } +} \ No newline at end of file From f3c51fc862be66535144514312415e4b290c344a Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 28 Jul 2020 09:09:15 -0500 Subject: [PATCH 2/4] Addressed review comments Signed-off-by: Robert (Bobby) Evans --- .../spark/sql/rapids/stringFunctions.scala | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 9f05a2dfea3..079f2f03f18 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -697,7 +697,6 @@ case class GpuSubstringIndex(strExpr: Expression, } trait BasePad extends GpuTernaryExpression with ImplicitCastInputTypes with NullIntolerant { - val str: Expression val len: Expression val pad: Expression @@ -706,7 +705,6 @@ trait BasePad extends GpuTernaryExpression with ImplicitCastInputTypes with Null override def children: Seq[Expression] = str :: len :: pad :: Nil override def dataType: DataType = StringType override def inputTypes: Seq[DataType] = Seq(StringType, IntegerType, StringType) - override def prettyName: String = "lpad" override def doColumnar(str: GpuColumnVector, len: Scalar, pad: Scalar): GpuColumnVector = { if (len.isValid && pad.isValid) { @@ -722,15 +720,15 @@ trait BasePad extends GpuTernaryExpression with ImplicitCastInputTypes with Null } override def doColumnar( - str: GpuColumnVector, - len: GpuColumnVector, - pad: GpuColumnVector): GpuColumnVector = + str: GpuColumnVector, + len: GpuColumnVector, + pad: GpuColumnVector): GpuColumnVector = throw new IllegalStateException("This is not supported yet") override def doColumnar( - str: Scalar, - len: GpuColumnVector, - pad: GpuColumnVector): GpuColumnVector = + str: Scalar, + len: GpuColumnVector, + pad: GpuColumnVector): GpuColumnVector = throw new IllegalStateException("This is not supported yet") override def doColumnar(str: Scalar, len: Scalar, pad: GpuColumnVector): GpuColumnVector = @@ -740,21 +738,22 @@ trait BasePad extends GpuTernaryExpression with ImplicitCastInputTypes with Null throw new IllegalStateException("This is not supported yet") override def doColumnar( - str: GpuColumnVector, - len: Scalar, - pad: GpuColumnVector): GpuColumnVector = + str: GpuColumnVector, + len: Scalar, + pad: GpuColumnVector): GpuColumnVector = throw new IllegalStateException("This is not supported yet") override def doColumnar( - str: GpuColumnVector, - len: GpuColumnVector, - pad: Scalar): GpuColumnVector = + str: GpuColumnVector, + len: GpuColumnVector, + pad: Scalar): GpuColumnVector = throw new IllegalStateException("This is not supported yet") } case class GpuStringLPad(str: Expression, len: Expression, pad: Expression) extends BasePad { val direction = PadSide.LEFT + override def prettyName: String = "lpad" def this(str: Expression, len: Expression) = { this(str, len, GpuLiteral(" ", StringType)) @@ -764,6 +763,7 @@ case class GpuStringLPad(str: Expression, len: Expression, pad: Expression) case class GpuStringRPad(str: Expression, len: Expression, pad: Expression) extends BasePad { val direction = PadSide.RIGHT + override def prettyName: String = "rpad" def this(str: Expression, len: Expression) = { this(str, len, GpuLiteral(" ", StringType)) From 6671d64b289befce9227e294fc49614d496d6ffa Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 28 Jul 2020 09:44:43 -0500 Subject: [PATCH 3/4] More nit picking Signed-off-by: Robert (Bobby) Evans --- .../spark/sql/rapids/stringFunctions.scala | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 079f2f03f18..1fbfe8cb571 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -720,15 +720,15 @@ trait BasePad extends GpuTernaryExpression with ImplicitCastInputTypes with Null } override def doColumnar( - str: GpuColumnVector, - len: GpuColumnVector, - pad: GpuColumnVector): GpuColumnVector = + str: GpuColumnVector, + len: GpuColumnVector, + pad: GpuColumnVector): GpuColumnVector = throw new IllegalStateException("This is not supported yet") override def doColumnar( - str: Scalar, - len: GpuColumnVector, - pad: GpuColumnVector): GpuColumnVector = + str: Scalar, + len: GpuColumnVector, + pad: GpuColumnVector): GpuColumnVector = throw new IllegalStateException("This is not supported yet") override def doColumnar(str: Scalar, len: Scalar, pad: GpuColumnVector): GpuColumnVector = @@ -738,15 +738,15 @@ trait BasePad extends GpuTernaryExpression with ImplicitCastInputTypes with Null throw new IllegalStateException("This is not supported yet") override def doColumnar( - str: GpuColumnVector, - len: Scalar, - pad: GpuColumnVector): GpuColumnVector = + str: GpuColumnVector, + len: Scalar, + pad: GpuColumnVector): GpuColumnVector = throw new IllegalStateException("This is not supported yet") override def doColumnar( - str: GpuColumnVector, - len: GpuColumnVector, - pad: Scalar): GpuColumnVector = + str: GpuColumnVector, + len: GpuColumnVector, + pad: Scalar): GpuColumnVector = throw new IllegalStateException("This is not supported yet") } From c40e782bfa541257543eabd7f5da17a6d4f433fc Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 28 Jul 2020 09:57:42 -0500 Subject: [PATCH 4/4] Now the nits have nits Signed-off-by: Robert (Bobby) Evans --- .../scala/com/nvidia/spark/rapids/GpuOverrides.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 347dbc28470..81fd95f9286 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -1299,9 +1299,9 @@ object GpuOverrides { } } override def convertToGpu( - str: Expression, - width: Expression, - pad: Expression): GpuExpression = + str: Expression, + width: Expression, + pad: Expression): GpuExpression = GpuStringLPad(str, width, pad) }), expr[StringRPad]( @@ -1321,9 +1321,9 @@ object GpuOverrides { } } override def convertToGpu( - str: Expression, - width: Expression, - pad: Expression): GpuExpression = + str: Expression, + width: Expression, + pad: Expression): GpuExpression = GpuStringRPad(str, width, pad) }), expr[StringLocate](