NVIDIA · anthony-chang · Jun 6, 2022 · May 12, 2022 · May 12, 2022 · May 16, 2022
diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
@@ -836,6 +836,20 @@ def test_regexp_extract_idx_0():
                 'regexp_extract(a, "^([a-d]*)[0-9]*([a-d]*)\\z", 0)'),
         conf=_regexp_conf)
 
+def test_word_boundaries():
+    gen = mk_str_gen('([abc]{1,3}[\r\n\t \f]{0,2}[123]){1,5}')
+    assert_gpu_and_cpu_are_equal_collect(
+            lambda spark: debug_df(unary_op_df(spark, gen).selectExpr('a', 
+                'rlike(a, "\\\\b")',
+                'rlike(a, "\\\\B")',
+                'rlike(a, "\\\\b\\\\B")',
+                'regexp_extract(a, "([a-d]+)\\\\b([e-h]+)", 1)',
+                'regexp_extract(a, "([a-d]+)\\\\B", 1)',
+                'regexp_replace(a, "\\\\b", "")',
+                'regexp_replace(a, "\\\\B", "")',
+            )),
+        conf=_regexp_conf)
+
 def test_regexp_hexadecimal_digits():
     gen = mk_str_gen(
         '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{10ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]')

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -643,9 +643,9 @@ class CudfRegexTranspiler(mode: RegexMode) {
         case 'W' =>
           // see https://github.com/NVIDIA/spark-rapids/issues/4475
           throw new RegexUnsupportedException("non-word class \\W is not supported")
-        case 'b' | 'B' =>
-          // see https://github.com/NVIDIA/spark-rapids/issues/4517
-          throw new RegexUnsupportedException("word boundaries are not supported")
+        case 'b' | 'B' if mode == RegexSplitMode =>
+          // see https://github.com/NVIDIA/spark-rapids/issues/5478
+          throw new RegexUnsupportedException("word boundaries are not supported in split mode")
         case 'A' if mode == RegexSplitMode =>
           throw new RegexUnsupportedException("string anchor \\A is not supported in split mode")
         case 'Z' if mode == RegexSplitMode || mode == RegexReplaceMode =>

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -175,7 +175,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     assertCpuGpuMatchesRegexpFind(patterns, Seq("", "\u0007", "a\u0007b", 
         "\u0007\u003f\u007f", "\u0080", "a\u00fe\u00ffb", "ab\ueeeecd"))
   }
-  
+
   test("string anchors - find") {
     val patterns = Seq("\\Atest", "\\A+test", "\\A{1}test", "\\A{1,}test",
         "(\\A)+test", "(\\A){1}test", "(\\A){1,}test", "test\\z")
@@ -245,6 +245,13 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     assertCpuGpuMatchesRegexpFind(patterns, inputs)
   }
 
+  test ("word boundaries will fall back to CPU - split") {
+    val patterns = Seq("\\b", "\\B")
+    patterns.foreach(pattern =>
+      assertUnsupported(pattern, RegexSplitMode, "word boundaries are not supported in split mode")
+    )
+  }
+
   test("whitespace boundaries - replace") {
     assertCpuGpuMatchesRegexpReplace(
       Seq("\\s", "\\S"),