Add support for word boundaries \b and \B (#5479)

* Enable \b and \B Signed-off-by: Anthony Chang <antchang@nvidia.com> * Add unit test for the fallback to CPU in split mode Signed-off-by: Anthony Chang <antchang@nvidia.com> * Use double backslash for integration tests Signed-off-by: Anthony Chang <antchang@nvidia.com> * Fix integration tests generating unwanted chars in input Signed-off-by: Anthony Chang <antchang@nvidia.com> * [WIP] save Signed-off-by: Anthony Chang <antchang@nvidia.com> * Fall back to CPU for $\B Signed-off-by: Anthony Chang <antchang@nvidia.com> * Also fall back for $\b Signed-off-by: Anthony Chang <antchang@nvidia.com> * Fix build Signed-off-by: Anthony Chang <antchang@nvidia.com>
NVIDIA · Jun 6, 2022 · 9334d01 · 9334d01
1 parent a561744
commit 9334d01
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 3 deletions.
diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
@@ -872,6 +872,20 @@ def test_regexp_extract_idx_0():
                 'regexp_extract(a, "^([a-d]*)[0-9]*([a-d]*)\\z", 0)'),
         conf=_regexp_conf)
 
+def test_word_boundaries():
+    gen = StringGen('([abc]{1,3}[\r\n\t \f]{0,2}[123]){1,5}')
+    assert_gpu_and_cpu_are_equal_collect(
+            lambda spark: unary_op_df(spark, gen).selectExpr(
+                'rlike(a, "\\\\b")',
+                'rlike(a, "\\\\B")',
+                'rlike(a, "\\\\b\\\\B")',
+                'regexp_extract(a, "([a-d]+)\\\\b([e-h]+)", 1)',
+                'regexp_extract(a, "([a-d]+)\\\\B", 1)',
+                'regexp_replace(a, "\\\\b", "#")',
+                'regexp_replace(a, "\\\\B", "#")',
+            ),
+        conf=_regexp_conf)
+
 def test_character_classes():
     gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}[ \n\t\r]{0,2}')
     assert_gpu_and_cpu_are_equal_collect(

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -889,6 +889,9 @@ class CudfRegexTranspiler(mode: RegexMode) {
                 RegexRepetition(lineTerminatorMatcher(Set(ch), true,
                     mode == RegexReplaceMode), SimpleQuantifier('?')),
                 RegexChar('$')))
+            case Some(RegexEscaped('b')) | Some(RegexEscaped('B')) =>
+              throw new RegexUnsupportedException(
+                      "regex sequences with \\b or \\B not supported around $")
             case _ =>
               // otherwise by default we can match any or none the full set of line terminators
               if (mode == RegexReplaceMode) {
@@ -962,9 +965,9 @@ class CudfRegexTranspiler(mode: RegexMode) {
           } else {
             RegexCharacterClass(negated = false, components)
           }
-        case 'b' | 'B' =>
-          // see https://github.com/NVIDIA/spark-rapids/issues/4517
-          throw new RegexUnsupportedException("word boundaries are not supported")
+        case 'b' | 'B' if mode == RegexSplitMode =>
+          // see https://github.com/NVIDIA/spark-rapids/issues/5478
+          throw new RegexUnsupportedException("word boundaries are not supported in split mode")
         case 'A' if mode == RegexSplitMode =>
           throw new RegexUnsupportedException("string anchor \\A is not supported in split mode")
         case 'Z' if mode == RegexSplitMode =>
@@ -1144,6 +1147,9 @@ class CudfRegexTranspiler(mode: RegexMode) {
                           RegexRepetition(lineTerminatorMatcher(Set(ch), true, false),
                             SimpleQuantifier('?')), RegexChar('$')))))
                     popBackrefIfNecessary(false)
+                  case RegexEscaped('b') | RegexEscaped('B') =>
+                    throw new RegexUnsupportedException(
+                      "regex sequences with \\b or \\B not supported around $")
                   case _ =>
                     r.append(rewrite(part, replacement, last))
                 }

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -297,6 +297,13 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     }
   }
 
+  test ("word boundaries will fall back to CPU - split") {
+    val patterns = Seq("\\b", "\\B")
+    patterns.foreach(pattern =>
+      assertUnsupported(pattern, RegexSplitMode, "word boundaries are not supported in split mode")
+    )
+  }
+
   test("whitespace boundaries - replace") {
     assertCpuGpuMatchesRegexpReplace(
       Seq("\\s", "\\S"),