Skip to content

Commit

Permalink
Add support for word boundaries \b and \B (#5479)
Browse files Browse the repository at this point in the history
* Enable \b and \B

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Add unit test for the fallback to CPU in split mode

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Use double backslash for integration tests

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Fix integration tests generating unwanted chars in input

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* [WIP] save

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Fall back to CPU for $\B

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Also fallback for $\b

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Fix build

Signed-off-by: Anthony Chang <antchang@nvidia.com>
  • Loading branch information
anthony-chang authored Jun 6, 2022
1 parent a561744 commit 9334d01
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 3 deletions.
14 changes: 14 additions & 0 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -872,6 +872,20 @@ def test_regexp_extract_idx_0():
'regexp_extract(a, "^([a-d]*)[0-9]*([a-d]*)\\z", 0)'),
conf=_regexp_conf)

def test_word_boundaries():
    # Input strings are 1-5 repetitions of: 1-3 letters from [abc], up to two
    # whitespace characters (CR, LF, tab, space, form feed), then one digit from [123].
    boundary_gen = StringGen('([abc]{1,3}[\r\n\t \f]{0,2}[123]){1,5}')

    # SQL expressions exercising \b and \B through rlike, regexp_extract and
    # regexp_replace.
    boundary_exprs = [
        'rlike(a, "\\\\b")',
        'rlike(a, "\\\\B")',
        'rlike(a, "\\\\b\\\\B")',
        'regexp_extract(a, "([a-d]+)\\\\b([e-h]+)", 1)',
        'regexp_extract(a, "([a-d]+)\\\\B", 1)',
        'regexp_replace(a, "\\\\b", "#")',
        'regexp_replace(a, "\\\\B", "#")',
    ]

    def run_queries(spark):
        # Build the single-column DataFrame and evaluate every expression on it.
        return unary_op_df(spark, boundary_gen).selectExpr(*boundary_exprs)

    assert_gpu_and_cpu_are_equal_collect(run_queries, conf=_regexp_conf)

def test_character_classes():
gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}[ \n\t\r]{0,2}')
assert_gpu_and_cpu_are_equal_collect(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,9 @@ class CudfRegexTranspiler(mode: RegexMode) {
RegexRepetition(lineTerminatorMatcher(Set(ch), true,
mode == RegexReplaceMode), SimpleQuantifier('?')),
RegexChar('$')))
case Some(RegexEscaped('b')) | Some(RegexEscaped('B')) =>
throw new RegexUnsupportedException(
"regex sequences with \\b or \\B not supported around $")
case _ =>
// otherwise by default we can match any or none the full set of line terminators
if (mode == RegexReplaceMode) {
Expand Down Expand Up @@ -962,9 +965,9 @@ class CudfRegexTranspiler(mode: RegexMode) {
} else {
RegexCharacterClass(negated = false, components)
}
case 'b' | 'B' =>
// see https://github.com/NVIDIA/spark-rapids/issues/4517
throw new RegexUnsupportedException("word boundaries are not supported")
case 'b' | 'B' if mode == RegexSplitMode =>
// see https://github.com/NVIDIA/spark-rapids/issues/5478
throw new RegexUnsupportedException("word boundaries are not supported in split mode")
case 'A' if mode == RegexSplitMode =>
throw new RegexUnsupportedException("string anchor \\A is not supported in split mode")
case 'Z' if mode == RegexSplitMode =>
Expand Down Expand Up @@ -1144,6 +1147,9 @@ class CudfRegexTranspiler(mode: RegexMode) {
RegexRepetition(lineTerminatorMatcher(Set(ch), true, false),
SimpleQuantifier('?')), RegexChar('$')))))
popBackrefIfNecessary(false)
case RegexEscaped('b') | RegexEscaped('B') =>
throw new RegexUnsupportedException(
"regex sequences with \\b or \\B not supported around $")
case _ =>
r.append(rewrite(part, replacement, last))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,13 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}
}

// Both word-boundary escapes must be reported as unsupported when transpiling in split mode.
test("word boundaries will fall back to CPU - split") {
  val expectedMessage = "word boundaries are not supported in split mode"
  for (boundary <- Seq("\\b", "\\B")) {
    assertUnsupported(boundary, RegexSplitMode, expectedMessage)
  }
}

test("whitespace boundaries - replace") {
assertCpuGpuMatchesRegexpReplace(
Seq("\\s", "\\S"),
Expand Down

0 comments on commit 9334d01

Please sign in to comment.