Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Support for Lazy Quantifier [databricks] #10208

Merged
merged 11 commits into from
Jan 24, 2024
2 changes: 1 addition & 1 deletion docs/compatibility.md
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ The following regular expression patterns are not yet supported on the GPU and w
or more results
- Line anchor `$` and string anchor `\Z` are not supported in patterns containing `\W` or `\D`
- Line and string anchors are not supported by `string_split` and `str_to_map`
- Lazy quantifiers, such as `a*?`
- Lazy quantifiers within a choice block such as `(2|\u2029??)+`
- Possessive quantifiers, such as `a*+`
- Character classes that use union, intersection, or subtraction semantics, such as `[a-d[m-p]]`, `[a-z&&[def]]`,
or `[a-z&&[^bc]]`
Expand Down
11 changes: 10 additions & 1 deletion integration_tests/src/main/python/regexp_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -1064,3 +1064,12 @@ def test_re_replace_all():
lambda spark: unary_op_df(spark, gen).selectExpr(
'REGEXP_REPLACE(a, ".*$", "PROD", 1)'),
conf=_regexp_conf)

def test_lazy_quantifier():
    # Feeds strings holding two double-quoted lowercase runs through REGEXP_EXTRACT using the
    # lazy quantifiers `??`, `+?` and `*?`, and hands the resulting query to
    # assert_gpu_and_cpu_are_equal_collect together with the shared regexp conf.
    quoted_words_gen = mk_str_gen('[a-z]{0,2} \"[a-z]{0,2}\" and \"[a-z]{0,3}\"')
    select_exprs = [
        'a',
        r'REGEXP_EXTRACT(a, "(\".??\")")',
        r'REGEXP_EXTRACT(a, "(\".+?\")")',
        r'REGEXP_EXTRACT(a, "(\".*?\")")',
    ]

    def extract_quoted(spark):
        # Same projection as before: the input column followed by one extract per quantifier.
        return unary_op_df(spark, quoted_words_gen).selectExpr(*select_exprs)

    assert_gpu_and_cpu_are_equal_collect(extract_quoted, conf=_regexp_conf)
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -1516,9 +1516,8 @@ class CudfRegexTranspiler(mode: RegexMode) {
case (RegexRepetition(_, SimpleQuantifier('*')), SimpleQuantifier('+')) =>
throw new RegexUnsupportedException("Possessive quantifier *+ not supported",
quantifier.position)
case (RegexRepetition(_, SimpleQuantifier('*')), SimpleQuantifier('?')) =>
throw new RegexUnsupportedException("Lazy quantifier *? not supported",
quantifier.position)
case (RegexRepetition(_, SimpleQuantifier('?' | '*' | '+')), SimpleQuantifier('?')) =>
RegexRepetition(rewrite(base, replacement, None, flags), quantifier)
case _ =>
throw new RegexUnsupportedException("Preceding token cannot be quantified",
quantifier.position)
Expand Down Expand Up @@ -1566,6 +1565,20 @@ class CudfRegexTranspiler(mode: RegexMode) {
r.position)
}

(ll, rr) match {
// ll = lazyQuantifier inside a choice
case (RegexSequence(ListBuffer(RegexRepetition(
RegexRepetition(_, SimpleQuantifier('?')), SimpleQuantifier('?')))), _) |
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: indent seems off here? I would expect this second line to be indented more than the case in the prev line. Same comment for other uses of this pattern in this PR.

// rr = lazyQuantifier inside a choice
(_, RegexSequence(ListBuffer(RegexRepetition(
RegexRepetition(_, SimpleQuantifier('?')), SimpleQuantifier('?'))))) =>
throw new RegexUnsupportedException(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"cuDF does not support lazy quantifier inside choice", r.position)
case (_, RegexChoice(RegexSequence(_), RegexSequence(ListBuffer(RegexRepetition(
RegexEscaped('A'), SimpleQuantifier('?')), _)))) =>
throw new RegexUnsupportedException("Invalid regex pattern at position", r.position)
case _ =>
}
RegexChoice(ll, rr)

case g @ RegexGroup(_, _, Some(lookahead)) =>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -98,7 +98,7 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite {

// Passes choice patterns whose alternatives carry repetition quantifiers to
// assertCpuGpuMatchesRegexpFind, covering greedy forms (`?`, `*`, `+`, `{2,3}`) as well as
// the reluctant forms (`*?`, `+?`), against a small fixed set of input strings.
// NOTE(review): this excerpt comes from diff hunk `@@ -98,7 +98,7 @@`, so the two lines that
// finish the `patterns` list look like the before/after versions of a single line — confirm
// that only one of them is present in the real file.
test("choice with repetition - regexp_find") {
val patterns = Seq("b?|a", "b*|^\t", "b+|^\t", "a|b+", "a+|b+", "a{2,3}|b+", "a*|b+",
"[cat]{3}|dog")
"b*?|^\t", "b+?|^\t", "a|b+?", "a+?|b+?", "a{2,3}|b+?", "a*?|b+?", "[cat]{3}|dog")
assertCpuGpuMatchesRegexpFind(patterns, Seq("aaa", "bb", "a\tb", "aaaabbbb", "a\tb\ta\tb"))
}

Expand Down Expand Up @@ -770,6 +770,14 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite {
}
}

// Runs doStringSplitTest for a character class followed by a reluctant `??` or `*?`
// repetition, once for each of the negative split limits listed below.
test("regexp_split - character class repetition - ? and * with reluctant quantifier") {
  val reluctantPatterns = Set(raw"[a-z][0-9]??", raw"[a-z][0-9]*?")
  val inputs = Seq("a", "aa", "a1a1", "a1b2", "a1b")
  val negativeLimits = Seq(Integer.MIN_VALUE, -2, -1)
  negativeLimits.foreach { limit =>
    doStringSplitTest(reluctantPatterns, inputs, limit)
  }
}

test("regexp_split - repetition with {0,n}, or {0,}") {
// see https://github.com/NVIDIA/spark-rapids/issues/6958
val patterns = Set("ba{0,}", raw"a\02{0,}", "ba{0,2}", raw"b\02{0,10}")
Expand Down
Loading