Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added in StringRPad and StringLPad #445

Merged
merged 4 commits into from
Jul 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/configs.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,9 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.SpecifiedWindowFrame"></a>spark.rapids.sql.expression.SpecifiedWindowFrame| |specification of the width of the group (or "frame") of input rows around which a window function is evaluated|true|None|
<a name="sql.expression.Sqrt"></a>spark.rapids.sql.expression.Sqrt|`sqrt`|square root|true|None|
<a name="sql.expression.StartsWith"></a>spark.rapids.sql.expression.StartsWith| |Starts With|true|None|
<a name="sql.expression.StringLPad"></a>spark.rapids.sql.expression.StringLPad|`lpad`|Pad a string on the left|true|None|
<a name="sql.expression.StringLocate"></a>spark.rapids.sql.expression.StringLocate|`position`, `locate`|Substring search operator|true|None|
<a name="sql.expression.StringRPad"></a>spark.rapids.sql.expression.StringRPad|`rpad`|Pad a string on the right|true|None|
<a name="sql.expression.StringReplace"></a>spark.rapids.sql.expression.StringReplace|`replace`|StringReplace operator|true|None|
<a name="sql.expression.StringTrim"></a>spark.rapids.sql.expression.StringTrim|`trim`|StringTrim operator|true|None|
<a name="sql.expression.StringTrimLeft"></a>spark.rapids.sql.expression.StringTrimLeft|`ltrim`|StringTrimLeft operator|true|None|
Expand Down
4 changes: 2 additions & 2 deletions integration_tests/src/main/python/data_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,15 +160,15 @@ def with_special_pattern(self, pattern, flags=0, charset=sre_yield.CHARSET, weig
length = int(len(strs))
except OverflowError:
length = _MAX_CHOICES
return self.with_special_case(lambda rand : strs[rand.randint(0, length)], weight=weight)
return self.with_special_case(lambda rand : strs[rand.randrange(0, length)], weight=weight)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found this during debugging: `randint` is inclusive on the high end, so it can return `length` itself and index one past the end of `strs`, whereas `randrange` excludes the high end.


def start(self, rand):
strs = self.base_strs
try:
length = int(len(strs))
except OverflowError:
length = _MAX_CHOICES
self._start(rand, lambda : strs[rand.randint(0, length)])
self._start(rand, lambda : strs[rand.randrange(0, length)])

_BYTE_MIN = -(1 << 7)
_BYTE_MAX = (1 << 7) - 1
Expand Down
22 changes: 22 additions & 0 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,28 @@ def test_substring_index(data_gen,delim):
f.substring_index(f.col('a'), delim, -1),
f.substring_index(f.col('a'), delim, -4)))

# ONLY LITERAL WIDTH AND PAD ARE SUPPORTED
def test_lpad():
    """Run a set of LPAD expressions through assert_gpu_and_cpu_are_equal_collect.

    The cases cover a small width, a NULL width, a NULL pad, a single-character
    pad, and a negative width, all applied to the generated string column `a`.
    """
    str_gen = mk_str_gen('.{0,5}')
    lpad_exprs = [
        'LPAD(a, 2, " ")',     # width 2 with a space pad
        'LPAD(a, NULL, " ")',  # NULL width
        'LPAD(a, 5, NULL)',    # NULL pad
        'LPAD(a, 5, "G")',     # width 5 with a visible pad character
        'LPAD(a, -1, "G")',    # negative width
    ]

    def build_df(spark):
        return unary_op_df(spark, str_gen).selectExpr(*lpad_exprs)

    assert_gpu_and_cpu_are_equal_collect(build_df)

# ONLY LITERAL WIDTH AND PAD ARE SUPPORTED
def test_rpad():
    """Run a set of RPAD expressions through assert_gpu_and_cpu_are_equal_collect.

    The cases cover a small width, a NULL width, a NULL pad, a single-character
    pad, and a negative width, all applied to the generated string column `a`.
    """
    str_gen = mk_str_gen('.{0,5}')
    rpad_exprs = [
        'RPAD(a, 2, " ")',     # width 2 with a space pad
        'RPAD(a, NULL, " ")',  # NULL width
        'RPAD(a, 5, NULL)',    # NULL pad
        'RPAD(a, 5, "G")',     # width 5 with a visible pad character
        'RPAD(a, -1, "G")',    # negative width
    ]

    def build_df(spark):
        return unary_op_df(spark, str_gen).selectExpr(*rpad_exprs)

    assert_gpu_and_cpu_are_equal_collect(build_df)

# ONLY LITERAL SEARCH PARAMS ARE SUPPORTED
def test_position():
gen = mk_str_gen('.{0,3}Z_Z.{0,3}A.{0,3}')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1282,6 +1282,50 @@ object GpuOverrides {
override def convertToGpu(child: Expression): GpuExpression = GpuLower(child)
})
.incompat(CASE_MODIFICATION_INCOMPAT),
expr[StringLPad](
  "Pad a string on the left",
  (in, conf, p, r) => new TernaryExprMeta[StringLPad](in, conf, p, r) {
    // Flags the expression as not runnable on the GPU unless the length is a literal and
    // the pad is a literal that is either null or exactly one character long.
    override def tagExprForGpu(): Unit = {
      if (!isLit(in.len)) {
        willNotWorkOnGpu("only literal length is supported")
      }

      extractLit(in.pad) match {
        case None =>
          willNotWorkOnGpu("only literal pad is supported")
        case Some(lit) if lit.value != null &&
            lit.value.asInstanceOf[UTF8String].toString.length != 1 =>
          willNotWorkOnGpu("only a single character is supported for pad")
        case _ => // null pad or single-character pad: nothing to flag
      }
    }

    // Builds the GPU replacement from the already-converted children.
    override def convertToGpu(
        str: Expression,
        width: Expression,
        pad: Expression): GpuExpression =
      GpuStringLPad(str, width, pad)
  }),
expr[StringRPad](
  "Pad a string on the right",
  (in, conf, p, r) => new TernaryExprMeta[StringRPad](in, conf, p, r) {
    // Flags the expression as not runnable on the GPU unless the length is a literal and
    // the pad is a literal that is either null or exactly one character long.
    override def tagExprForGpu(): Unit = {
      if (!isLit(in.len)) {
        willNotWorkOnGpu("only literal length is supported")
      }

      extractLit(in.pad) match {
        case None =>
          willNotWorkOnGpu("only literal pad is supported")
        case Some(lit) if lit.value != null &&
            lit.value.asInstanceOf[UTF8String].toString.length != 1 =>
          willNotWorkOnGpu("only a single character is supported for pad")
        case _ => // null pad or single-character pad: nothing to flag
      }
    }

    // Builds the GPU replacement from the already-converted children.
    override def convertToGpu(
        str: Expression,
        width: Expression,
        pad: Expression): GpuExpression =
      GpuStringRPad(str, width, pad)
  }),
expr[StringLocate](
"Substring search operator",
(in, conf, p, r) => new TernaryExprMeta[StringLocate](in, conf, p, r) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ package org.apache.spark.sql.rapids

import scala.collection.mutable.ArrayBuffer

import ai.rapids.cudf.{ColumnVector, Scalar, Table}
import ai.rapids.cudf.{ColumnVector, DType, PadSide, Scalar, Table}
import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.RapidsPluginImplicits._

Expand Down Expand Up @@ -696,3 +696,76 @@ case class GpuSubstringIndex(strExpr: Expression,
"Internal Error: this version of substring index is not supported")
}

/**
 * Shared implementation of the GPU string padding expressions.
 *
 * Implementations provide the three child expressions and the `direction` to pad in.
 * Only the overload taking a column `str` with scalar `len` and scalar `pad` is
 * implemented; every other `doColumnar` overload throws `IllegalStateException`.
 */
trait BasePad extends GpuTernaryExpression with ImplicitCastInputTypes with NullIntolerant {
  val str: Expression
  val len: Expression
  val pad: Expression
  val direction: PadSide

  override def inputTypes: Seq[DataType] = Seq(StringType, IntegerType, StringType)
  override def dataType: DataType = StringType
  override def children: Seq[Expression] = List(str, len, pad)

  /**
   * Pads every row of `str` using a scalar length and a scalar pad.
   *
   * If either scalar is invalid (null) the result is an all-null string column with the
   * same row count as `str`. Otherwise the length is clamped to be non-negative, the
   * column is padded to that width, and `substring(0, width)` of the padded column is
   * returned (presumably so inputs longer than the width are cut down to it).
   */
  override def doColumnar(str: GpuColumnVector, len: Scalar, pad: Scalar): GpuColumnVector = {
    if (!len.isValid || !pad.isValid) {
      withResource(Scalar.fromNull(DType.STRING)) { nullScalar =>
        val rows = str.getRowCount.toInt
        GpuColumnVector.from(ColumnVector.fromScalar(nullScalar, rows))
      }
    } else {
      // Negative lengths are treated as zero.
      val width = math.max(0, len.getInt)
      // The intermediate padded column is closed once the substring has been taken.
      withResource(str.getBase.pad(width, direction, pad.getJavaString)) { paddedCol =>
        GpuColumnVector.from(paddedCol.substring(0, width))
      }
    }
  }

  /** Common failure for every argument combination that is not implemented. */
  private def notSupportedYet(): Nothing =
    throw new IllegalStateException("This is not supported yet")

  override def doColumnar(
      str: GpuColumnVector,
      len: GpuColumnVector,
      pad: GpuColumnVector): GpuColumnVector = notSupportedYet()

  override def doColumnar(
      str: Scalar,
      len: GpuColumnVector,
      pad: GpuColumnVector): GpuColumnVector = notSupportedYet()

  override def doColumnar(
      str: Scalar,
      len: Scalar,
      pad: GpuColumnVector): GpuColumnVector = notSupportedYet()

  override def doColumnar(
      str: Scalar,
      len: GpuColumnVector,
      pad: Scalar): GpuColumnVector = notSupportedYet()

  override def doColumnar(
      str: GpuColumnVector,
      len: Scalar,
      pad: GpuColumnVector): GpuColumnVector = notSupportedYet()

  override def doColumnar(
      str: GpuColumnVector,
      len: GpuColumnVector,
      pad: Scalar): GpuColumnVector = notSupportedYet()
}

/**
 * GPU `lpad` expression: a [[BasePad]] whose padding side is `PadSide.LEFT`.
 *
 * @param str expression producing the strings to pad
 * @param len expression producing the target length
 * @param pad expression producing the padding string
 */
case class GpuStringLPad(str: Expression, len: Expression, pad: Expression)
  extends BasePad {

  override def prettyName: String = "lpad"

  override val direction: PadSide = PadSide.LEFT

  /** Two-argument form: uses a single space literal as the pad. */
  def this(str: Expression, len: Expression) =
    this(str, len, GpuLiteral(" ", StringType))
}

/**
 * GPU `rpad` expression: a [[BasePad]] whose padding side is `PadSide.RIGHT`.
 *
 * @param str expression producing the strings to pad
 * @param len expression producing the target length
 * @param pad expression producing the padding string
 */
case class GpuStringRPad(str: Expression, len: Expression, pad: Expression)
  extends BasePad {

  override def prettyName: String = "rpad"

  override val direction: PadSide = PadSide.RIGHT

  /** Two-argument form: uses a single space literal as the pad. */
  def this(str: Expression, len: Expression) =
    this(str, len, GpuLiteral(" ", StringType))
}