From 4534a285c649be8fe7dde56b25078d77d82cf002 Mon Sep 17 00:00:00 2001 From: Dmitrii Blaginin Date: Thu, 10 Oct 2024 21:28:50 +0100 Subject: [PATCH] Support DictionaryString for Regex matching operators (#12768) * Do not optimise regex casts in `UnwrapCastInComparison` * Mark regex operators are non-comparison ones * Also remove is distinct from * Rename method for `supports_propagation` for clarity * Remove duplicated tests * Fix doctest * Revert `supports_propagation` and handle dicts in `binary_string_array_flag_op_scalar` * Update tabulation * Minimise code inside `downcast_dictionary_array!` to fix wasm-pack build * Switch to `as_any_dictionary` * Use `take_iter` * Revert `is_comparison_operator` and make it deprecated --- .../expr-common/src/interval_arithmetic.rs | 2 +- datafusion/expr-common/src/operator.rs | 16 ++++- .../src/unwrap_cast_in_comparison.rs | 2 +- .../physical-expr/src/expressions/binary.rs | 59 +++++++++++++++---- .../test_files/string/large_string.slt | 13 ---- .../sqllogictest/test_files/string/string.slt | 13 ---- .../test_files/string/string_query.slt.part | 22 ++++--- .../test_files/string/string_view.slt | 13 ---- 8 files changed, 72 insertions(+), 68 deletions(-) diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index 6424888c896a..e76453d91a8e 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -1753,7 +1753,7 @@ impl NullableInterval { } _ => Ok(Self::MaybeNull { values }), } - } else if op.is_comparison_operator() { + } else if op.supports_propagation() { Ok(Self::Null { datatype: DataType::Boolean, }) diff --git a/datafusion/expr-common/src/operator.rs b/datafusion/expr-common/src/operator.rs index e013b6fafa22..6ca0f04897ac 100644 --- a/datafusion/expr-common/src/operator.rs +++ b/datafusion/expr-common/src/operator.rs @@ -142,10 +142,11 @@ impl Operator { ) } - /// Return true if the operator is a comparison operator. + /// Return true if the comparison operator can be used in interval arithmetic and constraint + /// propagation /// - /// For example, 'Binary(a, >, b)' would be a comparison expression. - pub fn is_comparison_operator(&self) -> bool { + /// For example, 'Binary(a, >, b)' expression supports propagation. + pub fn supports_propagation(&self) -> bool { matches!( self, Operator::Eq @@ -163,6 +164,15 @@ impl Operator { ) } + /// Return true if the comparison operator can be used in interval arithmetic and constraint + /// propagation + /// + /// For example, 'Binary(a, >, b)' expression supports propagation. + #[deprecated(since = "43.0.0", note = "please use `supports_propagation` instead")] + pub fn is_comparison_operator(&self) -> bool { + self.supports_propagation() + } + /// Return true if the operator is a logic operator. /// /// For example, 'Binary(Binary(a, >, b), AND, Binary(a, <, b + 3))' would diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 22e3c0ddd076..31e21d08b569 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -146,7 +146,7 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter { }; is_supported_type(&left_type) && is_supported_type(&right_type) - && op.is_comparison_operator() + && op.supports_propagation() } => { match (left.as_mut(), right.as_mut()) { diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 3d9072c2e14f..47b04a876b37 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -186,7 +186,9 @@ macro_rules! compute_utf8_flag_op { } macro_rules! binary_string_array_flag_op_scalar { - ($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{ + ($LEFT:ident, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{ + // This macro is slightly different from binary_string_array_flag_op because, when comparing with a scalar value, + // the query can be optimized in such a way that operands will be dicts, so we need to support it here let result: Result> = match $LEFT.data_type() { DataType::Utf8View | DataType::Utf8 => { compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG) @@ -194,6 +196,27 @@ macro_rules! binary_string_array_flag_op_scalar { DataType::LargeUtf8 => { compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG) }, + DataType::Dictionary(_, _) => { + let values = $LEFT.as_any_dictionary().values(); + + match values.data_type() { + DataType::Utf8View | DataType::Utf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG), + DataType::LargeUtf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG), + other => internal_err!( + "Data type {:?} not supported as a dictionary value type for binary_string_array_flag_op_scalar operation '{}' on string array", + other, stringify!($OP) + ), + }.map( + // downcast_dictionary_array duplicates code per possible key type, so we aim to do all prep work before + |evaluated_values| downcast_dictionary_array! { + $LEFT => { + let unpacked_dict = evaluated_values.take_iter($LEFT.keys().iter().map(|opt| opt.map(|v| v as _))).collect::(); + Arc::new(unpacked_dict) as _ + }, + _ => unreachable!(), + } + ) + }, other => internal_err!( "Data type {:?} not supported for binary_string_array_flag_op_scalar operation '{}' on string array", other, stringify!($OP) @@ -211,20 +234,32 @@ macro_rules! compute_utf8_flag_op_scalar { .downcast_ref::<$ARRAYTYPE>() .expect("compute_utf8_flag_op_scalar failed to downcast array"); - if let ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) = $RIGHT { - let flag = $FLAG.then_some("i"); - let mut array = - paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?; - if $NOT { - array = not(&array).unwrap(); - } - Ok(Arc::new(array)) - } else { - internal_err!( + let string_value = match $RIGHT { + ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) => string_value, + ScalarValue::Dictionary(_, value) => { + match *value { + ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) => string_value, + other => return internal_err!( + "compute_utf8_flag_op_scalar failed to cast dictionary value {} for operation '{}'", + other, stringify!($OP) + ) + } + }, + _ => return internal_err!( "compute_utf8_flag_op_scalar failed to cast literal value {} for operation '{}'", $RIGHT, stringify!($OP) ) + + }; + + let flag = $FLAG.then_some("i"); + let mut array = + paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?; + if $NOT { + array = not(&array).unwrap(); } + + Ok(Arc::new(array)) }}; } @@ -429,7 +464,7 @@ impl PhysicalExpr for BinaryExpr { // end-points of its children. Ok(Some(vec![])) } - } else if self.op.is_comparison_operator() { + } else if self.op.supports_propagation() { Ok( propagate_comparison(&self.op, interval, left_interval, right_interval)? .map(|(left, right)| vec![left, right]), diff --git a/datafusion/sqllogictest/test_files/string/large_string.slt b/datafusion/sqllogictest/test_files/string/large_string.slt index af6d104e57ac..8d8a5711bdb8 100644 --- a/datafusion/sqllogictest/test_files/string/large_string.slt +++ b/datafusion/sqllogictest/test_files/string/large_string.slt @@ -59,19 +59,6 @@ Xiangpeng datafusion数据融合 false true false true Raphael datafusionДатаФусион false false false false NULL NULL NULL NULL NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# https://github.com/apache/datafusion/issues/12618 -query BB -SELECT - ascii_1 ~* '^a.{3}e', - unicode_1 ~* '^d.*Фу' -FROM test_basic_operator; ----- -true false -false false -false true -NULL NULL - # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index f003e01ecda0..e84342abd3df 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -34,19 +34,6 @@ statement ok create table test_substr as select arrow_cast(col1, 'Utf8') as c1 from test_substr_base; -# TODO: move it back to `string_query.slt.part` after fixing the issue -# https://github.com/apache/datafusion/issues/12618 -query BB -SELECT - ascii_1 ~* '^a.{3}e', - unicode_1 ~* '^d.*Фу' -FROM test_basic_operator; ----- -true false -false false -false true -NULL NULL - # TODO: move it back to `string_query.slt.part` after fixing the issue # see detail: https://github.com/apache/datafusion/issues/12637 # Test pattern with wildcard characters diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 6a02296f5e6c..dc5626b7d573 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -642,18 +642,16 @@ true false false true NULL NULL -# TODO: DictionaryString does not support ~* operator. Enable this after fixing the issue -# see issue: https://github.com/apache/datafusion/issues/12618 -#query BB -#SELECT -# ascii_1 ~* '^a.{3}e', -# unicode_1 ~* '^d.*Фу' -#FROM test_basic_operator; -#---- -#true false -#false false -#false true -#NULL NULL +query BB +SELECT + ascii_1 ~* '^a.{3}e', + unicode_1 ~* '^d.*Фу' +FROM test_basic_operator; +---- +true false +false false +false true +NULL NULL query BB SELECT diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index e01a40586fe0..19bea3bf6bd0 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -37,19 +37,6 @@ select arrow_cast(col1, 'Utf8View') as c1 from test_substr_base; statement ok drop table test_source -# TODO: move it back to `string_query.slt.part` after fixing the issue -# https://github.com/apache/datafusion/issues/12618 -query BB -SELECT - ascii_1 ~* '^a.{3}e', - unicode_1 ~* '^d.*Фу' -FROM test_basic_operator; ----- -true false -false false -false true -NULL NULL - # # common test for string-like functions and operators #