Skip to content

Commit

Permalink
Fix ClassCastException in regular expression transpiler (#5506)
Browse files Browse the repository at this point in the history
  • Loading branch information
andygrove authored May 18, 2022
1 parent dbca151 commit c0fe6e4
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 4 deletions.
10 changes: 10 additions & 0 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,16 @@ def test_re_replace():
'REGEXP_REPLACE(a, "TEST", NULL)'),
conf=_regexp_conf)

@allow_non_gpu('ProjectExec', 'RegExpReplace')
def test_re_replace_issue_5492():
    # Regression test for https://github.com/NVIDIA/spark-rapids/issues/5492
    # The pattern below is a negated character class that embeds a whitespace
    # meta-sequence; the check asserts on the 'RegExpReplace' fallback path.
    string_gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}')
    replace_expr = 'REGEXP_REPLACE(a, "[^\\\\sa-zA-Z0-9]", "x")'

    def build_df(spark):
        # Single string column `a` projected through the REGEXP_REPLACE call.
        return unary_op_df(spark, string_gen).selectExpr(replace_expr)

    assert_gpu_fallback_collect(
        build_df,
        'RegExpReplace',
        conf=_regexp_conf)

def test_re_replace_backrefs():
gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}TEST')
assert_gpu_and_cpu_are_equal_collect(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -846,10 +846,17 @@ class CudfRegexTranspiler(mode: RegexMode) {
case _ =>
}
val components: Seq[RegexCharacterClassComponent] = characters
.map(x => x match {
case RegexChar(ch) if "^$".contains(ch) => x
case _ => rewrite(x, replacement, None).asInstanceOf[RegexCharacterClassComponent]
})
.map {
case r @ RegexChar(ch) if "^$".contains(ch) => r
case ch => rewrite(ch, replacement, None) match {
case valid: RegexCharacterClassComponent => valid
case _ =>
// this can happen when a character class contains a meta-sequence such as
// `\s` that gets transpiled into another character class
throw new RegexUnsupportedException("Character class contains one or more " +
"characters that cannot be transpiled to supported character-class components")
}
}

if (negated) {
// There are differences between cuDF and Java handling of newlines
Expand Down

0 comments on commit c0fe6e4

Please sign in to comment.