Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for null characters in regular expressions #5834

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1076,15 +1076,6 @@ def test_rlike_null_pattern():
lambda spark: unary_op_df(spark, gen).selectExpr(
'a rlike NULL'))

@allow_non_gpu('ProjectExec', 'RLike')
def test_rlike_fallback_null_pattern():
    """Run an `rlike` whose pattern embeds a NUL character through the fallback check.

    The expression is handed to assert_gpu_fallback_collect together with the
    'RLike' name and the shared regexp conf (presumably asserting that RLike is
    the operator that falls back; confirm against the helper's definition).
    """
    str_gen = mk_str_gen('[abcd]{1,3}')

    def rlike_with_null(spark):
        # The pattern literal holds an actual NUL character after the 'a'.
        return unary_op_df(spark, str_gen).selectExpr('a rlike "a\u0000"')

    assert_gpu_fallback_collect(rlike_with_null, 'RLike', conf=_regexp_conf)

@allow_non_gpu('ProjectExec', 'RLike')
def test_rlike_fallback_empty_group():
gen = mk_str_gen('[abcd]{1,3}')
Expand Down
57 changes: 39 additions & 18 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,7 @@ class RegexParser(pattern: String) {
case '\\' =>
parseEscapedCharacter()
case '\u0000' =>
throw new RegexUnsupportedException(
"cuDF does not support null characters in regular expressions", Some(pos))
RegexGroup(false, RegexEscaped('0'))
case '*' | '+' | '?' =>
throw new RegexUnsupportedException(
"base expression cannot start with quantifier", Some(pos))
Expand Down Expand Up @@ -225,7 +224,7 @@ class RegexParser(pattern: String) {
characterClass.negated = true
case '\u0000' =>
throw new RegexUnsupportedException(
"cuDF does not support null characters in regular expressions", Some(pos))
"cuDF does not support null characters in character classes", Some(pos))
case ch =>
val nextChar: RegexCharacterClassComponent = ch match {
case '\\' =>
Expand All @@ -234,6 +233,9 @@ class RegexParser(pattern: String) {
// A hex or octal representation of a meta character gets treated as an escaped
// char. Example: [\x5ea] is treated as [\^a], not just [^a]
RegexEscaped(ch)
case RegexChar('\u0000') =>
throw new RegexUnsupportedException(
"cuDF does not support null characters in character classes", Some(pos))
case other => other
}
case '&' =>
Expand Down Expand Up @@ -495,9 +497,6 @@ class RegexParser(pattern: String) {
val value = Integer.parseInt(hexDigit, 16)
if (value < Character.MIN_CODE_POINT || value > Character.MAX_CODE_POINT) {
throw new RegexUnsupportedException(s"Invalid hex digit: $hexDigit")
} else if (value == 0) {
throw new RegexUnsupportedException(s"cuDF does not support null characters " +
s"in regular expressions", Some(pos))
}

RegexHexDigit(hexDigit)
Expand Down Expand Up @@ -621,6 +620,7 @@ object RegexParser {
case '\n' => "\\n"
case '\t' => "\\t"
case '\f' => "\\f"
case '\u0000' => "\\u0000"
case '\u000b' => "\\u000b"
case '\u0085' => "\\u0085"
case '\u2028' => "\\u2028"
Expand Down Expand Up @@ -1327,23 +1327,18 @@ class CudfRegexTranspiler(mode: RegexMode) {

// cuDF does not support terms ending with line anchors on one side
// of a choice, such as "^|$"
// Reports whether the trailing term of `e` is a line anchor.
//
// For a non-empty RegexSequence the last part that is not RegexEmpty is
// inspected recursively. RegexEscaped('A') counts as an anchor; every other
// node is delegated to isBeginOrEndLineAnchor.
def endsWithLineAnchor(e: RegexAST): Boolean = {
  e match {
    case RegexSequence(parts) if parts.nonEmpty =>
      // Skip trailing RegexEmpty parts so that the last real term is examined.
      val j = parts.lastIndexWhere {
        case RegexEmpty() => false
        case _ => true
      }
      // lastIndexWhere yields -1 when every part is RegexEmpty; indexing with
      // it would throw, so treat the sequence like any other non-sequence node.
      if (j >= 0) {
        endsWithLineAnchor(parts(j))
      } else {
        isBeginOrEndLineAnchor(e)
      }
    case RegexEscaped('A') => true
    case _ => isBeginOrEndLineAnchor(e)
  }
}
if (endsWithLineAnchor(ll) || endsWithLineAnchor(rr)) {
throw new RegexUnsupportedException(
"cuDF does not support terms ending with line anchors on one side of a choice")
}

// cuDF does not support terms ending with word boundaries on one side
// of a choice, such as "\\b|a"
if (endsWithWordBoundary(ll) || endsWithWordBoundary(rr)) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added in this PR because the updated fuzz tests caught this case

throw new RegexUnsupportedException(
"cuDF does not support terms ending with word boundaries on one side of a choice")
}

RegexChoice(ll, rr)

case RegexGroup(capture, term) =>
Expand Down Expand Up @@ -1379,6 +1374,32 @@ class CudfRegexTranspiler(mode: RegexMode) {
}
}

/**
 * Tests whether the trailing term of `regex` satisfies the predicate `f`.
 *
 * For a non-empty [[RegexSequence]] the last part that is not [[RegexEmpty]]
 * is examined recursively. Any other node, including a sequence that has no
 * such part, is passed to `f` directly.
 *
 * @param regex the AST node to inspect
 * @param f predicate applied to the trailing term
 * @return the result of `f` on the trailing term
 */
private def endsWith(regex: RegexAST, f: RegexAST => Boolean): Boolean = {
  regex match {
    case RegexSequence(parts) if parts.nonEmpty =>
      // Skip trailing RegexEmpty parts so that the last real term is examined.
      val j = parts.lastIndexWhere {
        case RegexEmpty() => false
        case _ => true
      }
      // lastIndexWhere yields -1 when every part is RegexEmpty; indexing with
      // it would throw, so fall back to applying `f` to the sequence itself,
      // which is what already happens for an empty sequence.
      if (j >= 0) {
        endsWith(parts(j), f)
      } else {
        f(regex)
      }
    case _ => f(regex)
  }
}

/**
 * Reports whether the trailing term of `e`, as located by `endsWith`, is a
 * line anchor: either `RegexEscaped('A')` or a node accepted by
 * `isBeginOrEndLineAnchor`.
 */
private def endsWithLineAnchor(e: RegexAST): Boolean = {
  val isAnchor: RegexAST => Boolean = {
    case RegexEscaped('A') => true
    case node => isBeginOrEndLineAnchor(node)
  }
  endsWith(e, isAnchor)
}

/**
 * Reports whether the trailing term of `e`, as located by `endsWith`, is a
 * `RegexEscaped` node whose character is `b` or `B`.
 */
private def endsWithWordBoundary(e: RegexAST): Boolean = {
  val isWordBoundary: RegexAST => Boolean = {
    case RegexEscaped(ch) if "bB".contains(ch) => true
    case _ => false
  }
  endsWith(e, isWordBoundary)
}

private def isBeginOrEndLineAnchor(regex: RegexAST): Boolean = regex match {
case RegexSequence(parts) => parts.nonEmpty && parts.forall(isBeginOrEndLineAnchor)
case RegexGroup(_, term) => isBeginOrEndLineAnchor(term)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,12 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
})
}

test("cuDF does not support null in pattern") {
val patterns = Seq("\u0000", "a\u0000b", "a(\u0000)b", "a[a-b][\u0000]")
patterns.foreach(pattern =>
test("cuDF does not support null in character classes") {
val patterns = Seq(raw"[\00]", "[a\u0000 b]", raw"[\x00]", raw"[\x{0000}]")
patterns.foreach(pattern => {
assertUnsupported(pattern, RegexFindMode,
"cuDF does not support null characters in regular expressions"))
"cuDF does not support null characters in character classes")
})
}

test("cuDF does not support class intersection &&") {
Expand Down Expand Up @@ -434,7 +435,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

private val REGEXP_LIMITED_CHARS_COMMON = "|()[]{},-./;:!^$#%&*+?<=>@\"'~`" +
"abc123x\\ \t\r\n\f\u000bBsdwSDWzZ"
"abc0123x\\ \t\r\n\f\u000b\u0000BsdwSDWzZ"

private val REGEXP_LIMITED_CHARS_FIND = REGEXP_LIMITED_CHARS_COMMON

Expand Down