NVIDIA · razajafri · Jan 24, 2024 · Jan 17, 2024 · Jan 17, 2024 · Jan 17, 2024
diff --git a/docs/compatibility.md b/docs/compatibility.md
@@ -491,7 +491,7 @@ The following regular expression patterns are not yet supported on the GPU and w
   or more results
 - Line anchor `$` and string anchors `\Z` are not supported in patterns containing `\W` or `\D`
 - Line and string anchors are not supported by `string_split` and `str_to_map`
-- Lazy quantifiers, such as `a*?`
+- Lazy quantifiers within a choice block, such as `(2|\u2029??)+`
 - Possessive quantifiers, such as `a*+`
 - Character classes that use union, intersection, or subtraction semantics, such as `[a-d[m-p]]`, `[a-z&&[def]]`,
   or `[a-z&&[^bc]]`

diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1064,3 +1064,12 @@ def test_re_replace_all():
         lambda spark: unary_op_df(spark, gen).selectExpr(
             'REGEXP_REPLACE(a, ".*$", "PROD", 1)'),
         conf=_regexp_conf)
+
+def test_lazy_quantifier():
+    gen = mk_str_gen('[a-z]{0,2} \"[a-z]{0,2}\" and \"[a-z]{0,3}\"')
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: unary_op_df(spark, gen).selectExpr(
+            'a', r'REGEXP_EXTRACT(a, "(\".??\")")',
+            r'REGEXP_EXTRACT(a, "(\".+?\")")',
+            r'REGEXP_EXTRACT(a, "(\".*?\")")'),
+        conf=_regexp_conf)
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1516,9 +1516,8 @@ class CudfRegexTranspiler(mode: RegexMode) {
         case (RegexRepetition(_, SimpleQuantifier('*')), SimpleQuantifier('+')) =>
           throw new RegexUnsupportedException("Possessive quantifier *+ not supported",
             quantifier.position)
-        case (RegexRepetition(_, SimpleQuantifier('*')), SimpleQuantifier('?')) =>
-          throw new RegexUnsupportedException("Lazy quantifier *? not supported",
-            quantifier.position)
+        case (RegexRepetition(_, SimpleQuantifier('?' | '*' | '+')), SimpleQuantifier('?')) =>
+          RegexRepetition(rewrite(base, replacement, None, flags), quantifier)
         case _ =>
           throw new RegexUnsupportedException("Preceding token cannot be quantified",
             quantifier.position)
@@ -1566,6 +1565,20 @@ class CudfRegexTranspiler(mode: RegexMode) {
             r.position)
         }
 
+        (ll, rr) match {
+          // ll = choice branch consisting solely of a lazy optional quantifier (e.g. `a??`)
+          case (RegexSequence(ListBuffer(RegexRepetition(
+          RegexRepetition(_, SimpleQuantifier('?')), SimpleQuantifier('?')))), _) |
+               // rr = choice branch consisting solely of a lazy optional quantifier (e.g. `a??`)
+               (_, RegexSequence(ListBuffer(RegexRepetition(
+               RegexRepetition(_, SimpleQuantifier('?')), SimpleQuantifier('?'))))) =>
+            throw new RegexUnsupportedException(
+              "cuDF does not support lazy quantifier inside choice", r.position)
+          case (_, RegexChoice(RegexSequence(_), RegexSequence(ListBuffer(RegexRepetition(
+          RegexEscaped('A'), SimpleQuantifier('?')), _)))) =>
+            throw new RegexUnsupportedException("Invalid regex pattern at position", r.position)
+          case _ =>
+        }
         RegexChoice(ll, rr)
 
       case g @ RegexGroup(_, _, Some(lookahead)) =>

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -98,7 +98,7 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite {
 
   test("choice with repetition - regexp_find") {
     val patterns = Seq("b?|a", "b*|^\t", "b+|^\t", "a|b+", "a+|b+", "a{2,3}|b+", "a*|b+",
-        "[cat]{3}|dog")
+      "b*?|^\t", "b+?|^\t", "a|b+?", "a+?|b+?", "a{2,3}|b+?", "a*?|b+?", "[cat]{3}|dog")
     assertCpuGpuMatchesRegexpFind(patterns, Seq("aaa", "bb", "a\tb", "aaaabbbb", "a\tb\ta\tb"))
   }
 
@@ -770,6 +770,14 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite {
     }
   }
 
+  test("regexp_split - character class repetition - ? and * with reluctant quantifier") {
+    val patterns = Set(raw"[a-z][0-9]??", raw"[a-z][0-9]*?")
+    val data = Seq("a", "aa", "a1a1", "a1b2", "a1b")
+    for (limit <- Seq(Integer.MIN_VALUE, -2, -1)) {
+      doStringSplitTest(patterns, data, limit)
+    }
+  }
+
   test("regexp_split - repetition with {0,n}, or {0,}") {
     // see https://github.com/NVIDIA/spark-rapids/issues/6958
     val patterns = Set("ba{0,}", raw"a\02{0,}", "ba{0,2}", raw"b\02{0,10}")