diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py
index e643b3e3a6c..a63a5a0ba6d 100644
--- a/integration_tests/src/main/python/regexp_test.py
+++ b/integration_tests/src/main/python/regexp_test.py
@@ -474,6 +474,17 @@ def test_regexp_rlike_rewrite_optimization_str_dig():
             'regexp_like(a, "[0-9]{4,}")',
             'regexp_like(a, "abcd([0-9]{5})")'),
         conf=_regexp_conf)
+
+# Covers the RLike rewrite of the Chinese-character range pattern [\\u4e00-\\u9fa5]+
+
+@pytest.mark.skipif(is_before_spark_320(), reason='regexp_like is synonym for RLike starting in Spark 3.2.0')
+def test_regexp_rlike_rewrite_optimization_chinese():
+    gen = mk_str_gen('[0-9]{0,2}([英伟达]{0,3})?[a-z]{0,2}')
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: unary_op_df(spark, gen).selectExpr(
+            'a',
+            'regexp_like(a, "[\\u4e00-\\u9fa5]+")'),
+        conf=_regexp_conf)
 
 def test_regexp_replace_character_set_negated():
     gen = mk_str_gen('[abcd]{0,3}[\r\n]{0,2}[abcd]{0,3}')
@@ -594,6 +605,7 @@ def test_character_classes():
         ),
         conf=_regexp_conf)
 
+@datagen_overrides(seed=0, reason="https://github.com/NVIDIA/spark-rapids/issues/10641")
 def test_regexp_choice():
     gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}[ \n\t\r]{0,2}')
     assert_gpu_and_cpu_are_equal_collect(
@@ -617,7 +629,7 @@ def test_regexp_hexadecimal_digits():
     gen = mk_str_gen(
         '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{10ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]')
     assert_gpu_and_cpu_are_equal_collect(
-        lambda spark: unary_op_df(spark, gen, length=10).selectExpr(
+        lambda spark: unary_op_df(spark, gen).selectExpr(
             'rlike(a, "\\\\x7f")',
             'rlike(a, "\\\\x80")',
             'rlike(a, "[\\\\xa0-\\\\xf0]")',
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
index e33548868f2..bf45fc153d7 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
@@ -1060,6 +1060,7 @@ object RegexprPart {
   case object End extends RegexprPart // $
   case object Wildcard extends RegexprPart // .* or (.*)
   case class Digits(from: Int, to: Int) extends RegexprPart // [0-9]{a, b}
+  case object Chinese extends RegexprPart // Chinese characters [\u4e00-\u9fa5]+
   case class Fixstring(name: String) extends RegexprPart // normal string without special characters
   case class Regexpr(value: String) extends RegexprPart // other strings
 }
@@ -1096,6 +1097,8 @@ class GpuRLikeMeta(
         Wildcard :: parseRegexToParts(s.substring(4))
       case s if s.endsWith("(.*)") =>
         parseRegexToParts(s.substring(0, s.length - 4)) :+ Wildcard
+      case s if s.endsWith("[\u4e00-\u9fa5]+") =>
+        parseRegexToParts(s.substring(0, s.length - 6)) :+ Chinese
       case s if s.endsWith("([0-9]{5})") =>
         parseRegexToParts(s.substring(0, s.length - 10)) :+ Digits(5, 5)
       case s if s.endsWith("[0-9]{4,}") =>
@@ -1125,15 +1128,20 @@ class GpuRLikeMeta(
           case Fixstring(s) :: List(End) => {
             GpuEndsWith(lhs, GpuLiteral(s, StringType))
           }
+          case Chinese :: rest
+              if rest == List() || rest.forall(_ == Wildcard) => {
+            // "+" => from = 1; 0x4e00..0x9fa5 are the bounds of the regex character class
+            GpuStringDigits(lhs, GpuLiteral("", StringType), 1, 0x4e00, 0x9fa5)
+          }
           case Digits(from, _) :: rest
               if rest == List() || rest.forall(_ == Wildcard) => {
-            // println(s"!!!GpuStringDigits1: $from")
-            GpuStringDigits(lhs, GpuLiteral("", StringType), from)
+            // no literal prefix; 48 and 57 are the code points of '0' and '9'
+            GpuStringDigits(lhs, GpuLiteral("", StringType), from, 48, 57)
           }
           case Fixstring(s) :: Digits(from, _) :: rest
               if rest == List() || rest.forall(_ == Wildcard) => {
-            // println(s"!!!GpuStringDigits2: $s, $from")
-            GpuStringDigits(lhs, GpuLiteral(s, StringType), from)
+            // literal prefix s, then digits; 48 and 57 are the code points of '0' and '9'
+            GpuStringDigits(lhs, GpuLiteral(s, StringType), from, 48, 57)
           }
           case Fixstring(s) :: rest
               if rest == List() || rest.forall(_ == Wildcard) => {
@@ -1217,7 +1225,7 @@ class GpuRLikeMeta(
   }
 }
 
-case class GpuStringDigits(left: Expression, right: Expression, from: Int)
+case class GpuStringDigits(left: Expression, right: Expression, from: Int, start: Int, end: Int)
   extends GpuBinaryExpressionArgsAnyScalar with ImplicitCastInputTypes with NullIntolerant {
 
   override def dataType: DataType = BooleanType
@@ -1225,7 +1233,7 @@ case class GpuStringDigits(left: Expression, right: Expression, from: Int)
   override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
 
   override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = {
-    StringDigitsPattern.stringDigitsPattern(lhs.getBase, rhs.getBase, from)
+    StringDigitsPattern.stringDigitsPattern(lhs.getBase, rhs.getBase, from, start, end)
   }
 
   override def doColumnar(numRows: Int, lhs: GpuScalar, rhs: GpuScalar): ColumnVector = {