Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for word boundaries \b and \B #5479

Merged
14 changes: 14 additions & 0 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,20 @@ def test_regexp_extract_idx_0():
'regexp_extract(a, "^([a-d]*)[0-9]*([a-d]*)\\z", 0)'),
conf=_regexp_conf)

def test_word_boundaries():
    """Check the regex word-boundary and non-word-boundary assertions through
    rlike, regexp_extract and regexp_replace, with the resulting rows handed to
    assert_gpu_and_cpu_are_equal_collect for comparison."""
    # Each generated value is 1-5 repetitions of: 1-3 letters from [abc], up to
    # two whitespace characters, then one digit from [123]. This yields both
    # word/word and word/non-word transitions inside the same string.
    gen = mk_str_gen('([abc]{1,3}[\r\n\t \f]{0,2}[123]){1,5}')
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, gen).selectExpr(
            'a',
            'rlike(a, "\\\\b")',
            'rlike(a, "\\\\B")',
            'rlike(a, "\\\\b\\\\B")',
            'regexp_extract(a, "([a-d]+)\\\\b([e-h]+)", 1)',
            'regexp_extract(a, "([a-d]+)\\\\B", 1)',
            # Both assertions are zero-width, so replacing them with "" returns
            # the input unchanged no matter where they matched. Inserting a
            # visible marker makes the match positions part of the compared
            # output.
            'regexp_replace(a, "\\\\b", "#")',
            'regexp_replace(a, "\\\\B", "#")',
        ),
        conf=_regexp_conf)

def test_regexp_hexadecimal_digits():
gen = mk_str_gen(
'[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{10ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -643,9 +643,9 @@ class CudfRegexTranspiler(mode: RegexMode) {
case 'W' =>
// see https://github.com/NVIDIA/spark-rapids/issues/4475
throw new RegexUnsupportedException("non-word class \\W is not supported")
case 'b' | 'B' =>
// see https://github.com/NVIDIA/spark-rapids/issues/4517
throw new RegexUnsupportedException("word boundaries are not supported")
case 'b' | 'B' if mode == RegexSplitMode =>
// see https://github.com/NVIDIA/spark-rapids/issues/5478
throw new RegexUnsupportedException("word boundaries are not supported in split mode")
case 'A' if mode == RegexSplitMode =>
throw new RegexUnsupportedException("string anchor \\A is not supported in split mode")
case 'Z' if mode == RegexSplitMode || mode == RegexReplaceMode =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
assertCpuGpuMatchesRegexpFind(patterns, Seq("", "\u0007", "a\u0007b",
"\u0007\u003f\u007f", "\u0080", "a\u00fe\u00ffb", "ab\ueeeecd"))
}

test("string anchors - find") {
val patterns = Seq("\\Atest", "\\A+test", "\\A{1}test", "\\A{1,}test",
"(\\A)+test", "(\\A){1}test", "(\\A){1,}test", "test\\z")
Expand Down Expand Up @@ -245,6 +245,13 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
assertCpuGpuMatchesRegexpFind(patterns, inputs)
}

test("word boundaries will fall back to CPU - split") {
  // Both boundary assertions are expected to be rejected in split mode with
  // the same split-specific message.
  val expectedError = "word boundaries are not supported in split mode"
  for (boundary <- Seq("\\b", "\\B")) {
    assertUnsupported(boundary, RegexSplitMode, expectedError)
  }
}

test("whitespace boundaries - replace") {
assertCpuGpuMatchesRegexpReplace(
Seq("\\s", "\\S"),
Expand Down