Skip to content

Commit

Permalink
Handle escaping the dangling right ] and right } in the regexp transpiler (#9239)
Browse files Browse the repository at this point in the history

* Handle escaping the dangling right ] and right } automatically in the transpiler to ensure compatibility with cudf

Signed-off-by: Navin Kumar <navink@nvidia.com>

* Fix a syntax error in pytest that snuck in

Signed-off-by: Navin Kumar <navink@nvidia.com>

* fix scalatest failures

Signed-off-by: Navin Kumar <navink@nvidia.com>

---------

Signed-off-by: Navin Kumar <navink@nvidia.com>
  • Loading branch information
NVnavkumar authored Sep 18, 2023
1 parent 92f308c commit 318807e
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 3 deletions.
24 changes: 24 additions & 0 deletions integration_tests/src/main/python/regexp_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,25 @@ def test_split_re_no_limit():
'split(a, "^[o]")'),
conf=_regexp_conf)

def test_split_with_dangling_brackets():
    """Compare CPU and GPU results of split() for patterns that end in a dangling ']' or '}'.

    Each pattern below is a character class followed by one or more unmatched
    closing brackets/braces, e.g. "[a-z]]" is the class [a-z] followed by a
    literal ']'.
    """
    # The generator regex and the first eight special cases never produce a ']'
    # character, so on their own the "]" patterns would never find a delimiter
    # and the comparison would pass trivially. The last four special cases put
    # ']' / ']]' / '}' / '}}' directly after a letter so every pattern below
    # has at least one row where it actually splits.
    data_gen = mk_str_gen('([bf]o{0,2}[.?+\\^$|{}]{1,2}){1,7}') \
        .with_special_case('boo.and.foo') \
        .with_special_case('boo?and?foo') \
        .with_special_case('boo+and+foo') \
        .with_special_case('boo^and^foo') \
        .with_special_case('boo$and$foo') \
        .with_special_case('boo|and|foo') \
        .with_special_case('boo{and}foo') \
        .with_special_case('boo$|and$|foo') \
        .with_special_case('boo]and]foo') \
        .with_special_case('boo]]and]]foo') \
        .with_special_case('boo}and}foo') \
        .with_special_case('boo}}and}}foo')
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : unary_op_df(spark, data_gen).selectExpr(
            'split(a, "[a-z]]")',
            'split(a, "[boo]]]")',
            'split(a, "[foo]}")',
            'split(a, "[foo]}}")'),
        conf=_regexp_conf)


def test_split_optimized_no_re():
data_gen = mk_str_gen('([bf]o{0,2}[.?+\\^$|{}]{1,2}){1,7}') \
.with_special_case('boo.and.foo') \
Expand All @@ -134,6 +153,11 @@ def test_split_optimized_no_re():
.with_special_case('boo$|and$|foo')
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen).selectExpr(
'split(a, "]")',
'split(a, "]]")',
'split(a, "}")',
'split(a, "}}")',
'split(a, ",")',
'split(a, "\\\\.")',
'split(a, "\\\\?")',
'split(a, "\\\\+")',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ class RegexParser(pattern: String) {
parseGroup()
case '[' =>
parseCharacterClass()
case ']' =>
RegexEscaped(']')
case '}' =>
RegexEscaped('}')
case '\\' =>
parseEscapedCharacter()
case '\u0000' =>
Expand Down Expand Up @@ -1857,7 +1861,7 @@ sealed case class RegexChar(ch: Char) extends RegexCharacterClassComponent {
override def toRegexString: String = ch.toString
}

sealed case class RegexEscaped(a: Char) extends RegexCharacterClassComponent{
sealed case class RegexEscaped(a: Char) extends RegexCharacterClassComponent {
def this(a: Char, position: Int) {
this(a)
this.position = Some(position)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class RegularExpressionParserSuite extends AnyFunSuite {
test("not a quantifier") {
assert(parse("{1}") ===
RegexSequence(ListBuffer(
RegexChar('{'), RegexChar('1'),RegexChar('}'))))
RegexChar('{'), RegexChar('1'),RegexEscaped('}'))))
}

test("nested repetition") {
Expand Down Expand Up @@ -109,7 +109,7 @@ class RegularExpressionParserSuite extends AnyFunSuite {
assert(parse("[a]]") ===
RegexSequence(ListBuffer(
RegexCharacterClass(negated = false,
ListBuffer(RegexChar('a'))), RegexChar(']'))))
ListBuffer(RegexChar('a'))), RegexEscaped(']'))))
}

test("escaped brackets") {
Expand Down

0 comments on commit 318807e

Please sign in to comment.