Skip to content

Commit

Permalink
Fall back to CPU for regular expressions containing hex digits (#4492)
Browse files Browse the repository at this point in the history
* Fall back to CPU for regular expressions containing hex digits

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* assert that hex and octal digits fall back to CPU

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* add newline between tests
  • Loading branch information
andygrove authored Jan 12, 2022
1 parent 6191989 commit 2e29f3c
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 14 deletions.
1 change: 1 addition & 0 deletions docs/compatibility.md
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ Here are some examples of regular expression patterns that are not supported on
- Beginning-of-line and end-of-line anchors (`^` and `$`) are not supported in some contexts, such as when combined
with a choice (`^|a`).
- String anchors `\z` and `\Z` are not supported by `regexp_replace`
- Hexadecimal and octal character escapes (for example, `\xA9` and `\077`)

In addition to these cases that can be detected, there are also known issues that can cause incorrect results:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -456,13 +456,14 @@ class CudfRegexTranspiler(replace: Boolean) {
}

case RegexOctalChar(_) =>
  // Octal escapes are rejected outright: cuDF produced different results
  // compared to Spark in some cases (example pattern: "a\141|.$").
  // see https://github.com/NVIDIA/spark-rapids/issues/4288
  val octalUnsupportedMessage =
    "cuDF does not support octal digits consistently with Spark"
  throw new RegexUnsupportedException(octalUnsupportedMessage)

case RegexHexDigit(_) =>
  // Hex escapes are rejected rather than passed through unchanged, so that
  // expressions containing them fall back to the CPU. The branch always
  // throws, so no transpiled node is produced here.
  // see https://github.com/NVIDIA/spark-rapids/issues/4486
  throw new RegexUnsupportedException(
    "cuDF does not support hex digits consistently with Spark")

case RegexEscaped(ch) => ch match {
case 'b' | 'B' =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,22 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
"cuDF does not support null characters in regular expressions"))
}

test("cuDF does not support hex digits consistently with Spark") {
  // Every pattern containing a hex escape must be reported as unsupported
  // with the message below.
  // see https://github.com/NVIDIA/spark-rapids/issues/4486
  // NOTE(review): in java.util.regex the `\xhh` form takes exactly two hex
  // digits, so the two longer patterns are presumably `\x00` / `\x10` followed
  // by literal characters rather than a single code point - confirm intent.
  val expectedMessage = "cuDF does not support hex digits consistently with Spark"
  for (hexPattern <- Seq(raw"\xA9", raw"\x00A9", raw"\x10FFFF")) {
    assertUnsupported(hexPattern, replace = false, expectedMessage)
  }
}

test("cuDF does not support octal digits consistently with Spark") {
  // Octal escapes of one, two and three digits (after the leading zero) must
  // all be reported as unsupported with the message below.
  // see https://github.com/NVIDIA/spark-rapids/issues/4288
  val expectedMessage = "cuDF does not support octal digits consistently with Spark"
  Seq(raw"\07", raw"\077", raw"\0377").foreach { octalPattern =>
    assertUnsupported(octalPattern, replace = false, expectedMessage)
  }
}

test("string anchors - find") {
val patterns = Seq("\\Atest", "test\\z", "test\\Z")
assertCpuGpuMatchesRegexpFind(patterns, Seq("", "test", "atest", "testa",
Expand Down Expand Up @@ -159,11 +175,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
assertCpuGpuMatchesRegexpFind(Seq(pattern), Seq("1\r2", "1\n2", "1\r\n2"))
}

ignore("known issue - octal digit") {
  // Registered with `ignore`, so this CPU/GPU comparison is not executed.
  // The original author noted that the hex spelling "a\\x61|.$" worked fine.
  val octalPattern = "a\\141|.$"
  val inputs = Seq("] b[")
  assertCpuGpuMatchesRegexpFind(Seq(octalPattern), inputs)
}

test("character class with ranges") {
val patterns = Seq("[a-b]", "[a-zA-Z]")
patterns.foreach(parse)
Expand Down Expand Up @@ -240,12 +251,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
assertCpuGpuMatchesRegexpFind(patterns, inputs)
}

test("compare CPU and GPU: hex") {
  // Compares CPU and GPU find results for the hex escape `\x61` (0x61 is 'a')
  // against one input that contains 'a' and one that does not.
  assertCpuGpuMatchesRegexpFind(Seq(raw"\x61"), Seq("a", "b"))
}

test("compare CPU and GPU: octal") {
val patterns = Seq("\\\\141")
val inputs = Seq("a", "b")
Expand Down

0 comments on commit 2e29f3c

Please sign in to comment.