Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for null characters in regular expressions #5834

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 41 additions & 18 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,7 @@ class RegexParser(pattern: String) {
case '\\' =>
parseEscapedCharacter()
case '\u0000' =>
throw new RegexUnsupportedException(
"cuDF does not support null characters in regular expressions", Some(pos))
RegexGroup(false, RegexEscaped('0'))
case '*' | '+' | '?' =>
throw new RegexUnsupportedException(
"base expression cannot start with quantifier", Some(pos))
Expand Down Expand Up @@ -225,7 +224,7 @@ class RegexParser(pattern: String) {
characterClass.negated = true
case '\u0000' =>
throw new RegexUnsupportedException(
"cuDF does not support null characters in regular expressions", Some(pos))
"cuDF does not support null characters in character classes", Some(pos))
case ch =>
val nextChar: RegexCharacterClassComponent = ch match {
case '\\' =>
Expand All @@ -234,6 +233,9 @@ class RegexParser(pattern: String) {
// A hex or octal representation of a meta character gets treated as an escaped
// char. Example: [\x5ea] is treated as [\^a], not just [^a]
RegexEscaped(ch)
case RegexChar('\u0000') =>
throw new RegexUnsupportedException(
"cuDF does not support null characters in character classes", Some(pos))
case other => other
}
case '&' =>
Expand Down Expand Up @@ -495,9 +497,6 @@ class RegexParser(pattern: String) {
val value = Integer.parseInt(hexDigit, 16)
if (value < Character.MIN_CODE_POINT || value > Character.MAX_CODE_POINT) {
throw new RegexUnsupportedException(s"Invalid hex digit: $hexDigit")
} else if (value == 0) {
throw new RegexUnsupportedException(s"cuDF does not support null characters " +
s"in regular expressions", Some(pos))
}

RegexHexDigit(hexDigit)
Expand Down Expand Up @@ -621,6 +620,7 @@ object RegexParser {
case '\n' => "\\n"
case '\t' => "\\t"
case '\f' => "\\f"
case '\u0000' => "\\u0000"
case '\u000b' => "\\u000b"
case '\u0085' => "\\u0085"
case '\u2028' => "\\u2028"
Expand Down Expand Up @@ -1327,23 +1327,18 @@ class CudfRegexTranspiler(mode: RegexMode) {

// cuDF does not support terms ending with line anchors on one side
// of a choice, such as "^|$"
/**
 * Returns true when the last non-empty term of `e` is a line anchor.
 *
 * For a sequence, trailing `RegexEmpty` parts are skipped and the check
 * recurses into the last remaining part. `\A` counts as a line anchor here;
 * everything else is delegated to `isBeginOrEndLineAnchor`.
 *
 * A sequence that is empty, or that contains only `RegexEmpty` parts, has no
 * last term to inspect, so it is passed to `isBeginOrEndLineAnchor` as-is
 * rather than being indexed (which would fail with index -1).
 */
def endsWithLineAnchor(e: RegexAST): Boolean = {
  e match {
    case RegexEscaped('A') => true
    case RegexSequence(parts) =>
      // Search from the end for the first part that is not RegexEmpty.
      val lastNonEmpty = parts.reverseIterator.find {
        case RegexEmpty() => false
        case _ => true
      }
      lastNonEmpty match {
        case Some(last) => endsWithLineAnchor(last)
        case None => isBeginOrEndLineAnchor(e)
      }
    case _ => isBeginOrEndLineAnchor(e)
  }
}
if (endsWithLineAnchor(ll) || endsWithLineAnchor(rr)) {
throw new RegexUnsupportedException(
"cuDF does not support terms ending with line anchors on one side of a choice")
}

// cuDF does not support terms ending with word boundaries on one side
// of a choice, such as "\\b|a"
if (endsWithWordBoundary(ll) || endsWithWordBoundary(rr)) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added in this PR because the updated fuzz tests caught this case

throw new RegexUnsupportedException(
"cuDF does not support terms ending with word boundaries on one side of a choice")
}

RegexChoice(ll, rr)

case RegexGroup(capture, term) =>
Expand Down Expand Up @@ -1379,6 +1374,32 @@ class CudfRegexTranspiler(mode: RegexMode) {
}
}

/**
 * Tests whether the last non-empty term of `regex` satisfies `f`.
 *
 * For a sequence, trailing `RegexEmpty` parts are skipped and the check
 * recurses into the last remaining part. Any other node is passed to `f`
 * directly.
 *
 * A sequence that is empty, or that contains only `RegexEmpty` parts, has no
 * last term to descend into; in that case `f` is applied to the sequence
 * itself instead of indexing with -1.
 *
 * @param regex the AST node to inspect
 * @param f predicate applied to the terminal node that is found
 * @return the result of `f` on the terminal node
 */
private def endsWith(regex: RegexAST, f: RegexAST => Boolean): Boolean = {
  regex match {
    case RegexSequence(parts) =>
      // Search from the end for the first part that is not RegexEmpty.
      val lastNonEmpty = parts.reverseIterator.find {
        case RegexEmpty() => false
        case _ => true
      }
      lastNonEmpty match {
        case Some(last) => endsWith(last, f)
        case None => f(regex)
      }
    case _ => f(regex)
  }
}

/**
 * Returns true when the terminal term of `e`, as located by `endsWith`,
 * is `\A` or is accepted by `isBeginOrEndLineAnchor`.
 */
private def endsWithLineAnchor(e: RegexAST): Boolean = {
  val isLineAnchor: RegexAST => Boolean = {
    case RegexEscaped('A') => true
    case node => isBeginOrEndLineAnchor(node)
  }
  endsWith(e, isLineAnchor)
}

/**
 * Returns true when the terminal term of `e`, as located by `endsWith`,
 * is the escaped character `b` or `B`.
 */
private def endsWithWordBoundary(e: RegexAST): Boolean = {
  def isWordBoundary(node: RegexAST): Boolean = node match {
    case RegexEscaped('b') | RegexEscaped('B') => true
    case _ => false
  }
  endsWith(e, isWordBoundary)
}

private def isBeginOrEndLineAnchor(regex: RegexAST): Boolean = regex match {
case RegexSequence(parts) => parts.nonEmpty && parts.forall(isBeginOrEndLineAnchor)
case RegexGroup(_, term) => isBeginOrEndLineAnchor(term)
Expand Down Expand Up @@ -1464,6 +1485,8 @@ sealed case class RegexHexDigit(a: String) extends RegexCharacterClassComponent
override def toRegexString: String = {
if (a.length == 2) {
s"\\x$a"
} else if (a == "0") {
s"\\0"
} else {
s"\\x{$a}"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
})
}

// Each pattern contains a null character and is expected to be rejected
// in find mode with the message below.
test("cuDF does not support null in pattern") {
  val expectedMessage = "cuDF does not support null characters in regular expressions"
  for (pattern <- Seq("\u0000", "a\u0000b", "a(\u0000)b", "a[a-b][\u0000]")) {
    assertUnsupported(pattern, RegexFindMode, expectedMessage)
  }
}

test("cuDF does not support class intersection &&") {
val patterns = Seq("[a&&b]", "[&&1]")
patterns.foreach(pattern =>
Expand Down Expand Up @@ -434,7 +427,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

private val REGEXP_LIMITED_CHARS_COMMON = "|()[]{},-./;:!^$#%&*+?<=>@\"'~`" +
"abc123x\\ \t\r\n\f\u000bBsdwSDWzZ"
"abc0123x\\ \t\r\n\f\u000b\u0000BsdwSDWzZ"

private val REGEXP_LIMITED_CHARS_FIND = REGEXP_LIMITED_CHARS_COMMON

Expand Down