Skip to content

Commit

Permalink
Reimplement check for non-regexp strings using RegexParser (#4681)
Browse files Browse the repository at this point in the history
* update copyright year

* invert the logic and rename the method

* Update tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionParserSuite.scala

Co-authored-by: Nghia Truong <ttnghia@users.noreply.github.com>

* add empty string to unit test

* Empty signed commit

Signed-off-by: Andy Grove <andygrove@nvidia.com>

* treat any string containing escaped characters as a regular expression

Co-authored-by: Nghia Truong <ttnghia@users.noreply.github.com>
  • Loading branch information
andygrove and ttnghia authored Feb 4, 2022
1 parent a9c05d0 commit ba0ec9f
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -492,11 +492,6 @@ object GpuOverrides extends Logging {
listeners.clear()
}

/**
 * Reports whether the given literal can be handled as a plain (non-regexp) string.
 *
 * @param strLit the pattern literal to inspect
 * @return true when the literal contains none of the entries in `regexList`
 */
def canRegexpBeTreatedLikeARegularString(strLit: UTF8String): Boolean = {
  val literal = strLit.toString
  // Equivalent to "no entry of regexList occurs in the literal".
  regexList.forall(token => !literal.contains(token))
}

private def convertPartToGpuIfPossible(part: Partitioning, conf: RapidsConf): Partitioning = {
part match {
case _: GpuPartitioning => part
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,30 @@ class RegexParser(pattern: String) {

}

object RegexParser {
  // Literal characters that force a pattern to be classified as a regular expression.
  private val regexpChars = Set('\u0000', '\\', '.', '^', '$', '\f')

  /**
   * Decides whether a pattern has to be treated as a regular expression.
   *
   * The pattern is parsed with [[RegexParser]] and the resulting AST is inspected. The answer
   * is false only when the AST consists solely of `RegexChar` nodes (optionally nested inside
   * `RegexSequence` nodes) whose characters are not listed in `regexpChars`.
   *
   * @param s the pattern to classify
   * @return true if the pattern is, or might be, a regular expression
   */
  def isRegExpString(s: String): Boolean = {

    // Recursively looks for any node that is not a plain, non-special character.
    def hasRegexSyntax(node: RegexAST): Boolean = node match {
      case RegexSequence(children) => children.exists(hasRegexSyntax)
      case RegexChar(c) => regexpChars.contains(c)
      // Escaped characters and every other node type count as regexp syntax.
      case _ => true
    }

    try {
      hasRegexSyntax(new RegexParser(s).parse())
    } catch {
      // if we cannot parse it then assume that it might be valid regexp
      case _: RegexUnsupportedException => true
    }
  }
}

/**
* Transpile Java/Spark regular expression to a format that cuDF supports, or throw an exception
* if this is not possible.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1296,7 +1296,7 @@ class GpuStringSplitMeta(
} else {
val str = regexp.get.value.asInstanceOf[UTF8String]
if (str != null) {
if (!canRegexpBeTreatedLikeARegularString(str)) {
if (RegexParser.isRegExpString(str.toString)) {
willNotWorkOnGpu("regular expressions are not supported yet")
}
if (str.numChars() == 0) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,6 +21,24 @@ import org.scalatest.FunSuite

class RegularExpressionParserSuite extends FunSuite {

// Each pattern below uses regular-expression syntax, so isRegExpString must return true for it.
test("detect regexp strings") {
  // Based on https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
  // Predefined character classes are listed in lower/upper-case pairs (e.g. \w and \W).
  val strings: Seq[String] = Seq("\\", "\u0000", "\\x00", "\\.",
    "\f", "\\a", "\\e", "\\cx", "[abc]", "^", "[a-z&&[def]]", ".", "*", "\\d", "\\D",
    "\\h", "\\H", "\\s", "\\S", "\\v", "\\V", "\\w", "\\W", "\\p", "$", "\\b", "\\B",
    "\\A", "\\G", "\\Z", "\\z", "\\R", "?", "|", "(abc)", "a{1,}", "\\k", "\\Q", "\\E")
  for (string <- strings) {
    // The clue identifies which pattern was misclassified when the assertion fails.
    assert(RegexParser.isRegExpString(string), s"pattern not detected as regexp: '$string'")
  }
}

// Plain literals, including the empty string, must not be classified as regular expressions.
test("detect non-regexp strings") {
  Seq("A", ",", "\t", ":", "").foreach { string =>
    assert(!RegexParser.isRegExpString(string))
  }
}

// Parsing an empty pattern yields a sequence with no parts.
test("empty pattern") {
  val expected = RegexSequence(ListBuffer())
  assert(parse("") === expected)
}
Expand Down

0 comments on commit ba0ec9f

Please sign in to comment.