Skip to content

Commit

Permalink
Support DictionaryString for Regex matching operators (#12768)
Browse files Browse the repository at this point in the history
* Do not optimise regex casts in `UnwrapCastInComparison`

* Mark regex operators are non-comparison ones

* Also remove is distinct from

* Rename method for `supports_propagation` for clarity

* Remove duplicated tests

* Fix doctest

* Revert `supports_propagation` and handle dicts in `binary_string_array_flag_op_scalar`

* Update tabulation

* Minimise code inside `downcast_dictionary_array!` to fix wasm-pack build

* Switch to `as_any_dictionary`

* Use `take_iter`

* Revert `is_comparison_operator` and make it deprecated
  • Loading branch information
blaginin authored Oct 10, 2024
1 parent 939ef9e commit 4534a28
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 68 deletions.
2 changes: 1 addition & 1 deletion datafusion/expr-common/src/interval_arithmetic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1753,7 +1753,7 @@ impl NullableInterval {
}
_ => Ok(Self::MaybeNull { values }),
}
} else if op.is_comparison_operator() {
} else if op.supports_propagation() {
Ok(Self::Null {
datatype: DataType::Boolean,
})
Expand Down
16 changes: 13 additions & 3 deletions datafusion/expr-common/src/operator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,11 @@ impl Operator {
)
}

/// Return true if the operator is a comparison operator.
/// Return true if the comparison operator can be used in interval arithmetic and constraint
/// propagation
///
/// For example, 'Binary(a, >, b)' would be a comparison expression.
pub fn is_comparison_operator(&self) -> bool {
/// For example, 'Binary(a, >, b)' expression supports propagation.
pub fn supports_propagation(&self) -> bool {
matches!(
self,
Operator::Eq
Expand All @@ -163,6 +164,15 @@ impl Operator {
)
}

/// Return true if the comparison operator can be used in interval arithmetic and constraint
/// propagation
///
/// For example, 'Binary(a, >, b)' expression supports propagation.
#[deprecated(since = "43.0.0", note = "please use `supports_propagation` instead")]
pub fn is_comparison_operator(&self) -> bool {
self.supports_propagation()
}

/// Return true if the operator is a logic operator.
///
/// For example, 'Binary(Binary(a, >, b), AND, Binary(a, <, b + 3))' would
Expand Down
2 changes: 1 addition & 1 deletion datafusion/optimizer/src/unwrap_cast_in_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter {
};
is_supported_type(&left_type)
&& is_supported_type(&right_type)
&& op.is_comparison_operator()
&& op.supports_propagation()
} =>
{
match (left.as_mut(), right.as_mut()) {
Expand Down
59 changes: 47 additions & 12 deletions datafusion/physical-expr/src/expressions/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,37 @@ macro_rules! compute_utf8_flag_op {
}

macro_rules! binary_string_array_flag_op_scalar {
($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
($LEFT:ident, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
// This macro is slightly different from binary_string_array_flag_op because, when comparing with a scalar value,
// the query can be optimized in such a way that operands will be dicts, so we need to support it here
let result: Result<Arc<dyn Array>> = match $LEFT.data_type() {
DataType::Utf8View | DataType::Utf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG)
},
DataType::LargeUtf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG)
},
DataType::Dictionary(_, _) => {
let values = $LEFT.as_any_dictionary().values();

match values.data_type() {
DataType::Utf8View | DataType::Utf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG),
DataType::LargeUtf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG),
other => internal_err!(
"Data type {:?} not supported as a dictionary value type for binary_string_array_flag_op_scalar operation '{}' on string array",
other, stringify!($OP)
),
}.map(
// downcast_dictionary_array duplicates code per possible key type, so we aim to do all prep work before
|evaluated_values| downcast_dictionary_array! {
$LEFT => {
let unpacked_dict = evaluated_values.take_iter($LEFT.keys().iter().map(|opt| opt.map(|v| v as _))).collect::<BooleanArray>();
Arc::new(unpacked_dict) as _
},
_ => unreachable!(),
}
)
},
other => internal_err!(
"Data type {:?} not supported for binary_string_array_flag_op_scalar operation '{}' on string array",
other, stringify!($OP)
Expand All @@ -211,20 +234,32 @@ macro_rules! compute_utf8_flag_op_scalar {
.downcast_ref::<$ARRAYTYPE>()
.expect("compute_utf8_flag_op_scalar failed to downcast array");

if let ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) = $RIGHT {
let flag = $FLAG.then_some("i");
let mut array =
paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
if $NOT {
array = not(&array).unwrap();
}
Ok(Arc::new(array))
} else {
internal_err!(
let string_value = match $RIGHT {
ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) => string_value,
ScalarValue::Dictionary(_, value) => {
match *value {
ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) => string_value,
other => return internal_err!(
"compute_utf8_flag_op_scalar failed to cast dictionary value {} for operation '{}'",
other, stringify!($OP)
)
}
},
_ => return internal_err!(
"compute_utf8_flag_op_scalar failed to cast literal value {} for operation '{}'",
$RIGHT, stringify!($OP)
)

};

let flag = $FLAG.then_some("i");
let mut array =
paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
if $NOT {
array = not(&array).unwrap();
}

Ok(Arc::new(array))
}};
}

Expand Down Expand Up @@ -429,7 +464,7 @@ impl PhysicalExpr for BinaryExpr {
// end-points of its children.
Ok(Some(vec![]))
}
} else if self.op.is_comparison_operator() {
} else if self.op.supports_propagation() {
Ok(
propagate_comparison(&self.op, interval, left_interval, right_interval)?
.map(|(left, right)| vec![left, right]),
Expand Down
13 changes: 0 additions & 13 deletions datafusion/sqllogictest/test_files/string/large_string.slt
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,6 @@ Xiangpeng datafusion数据融合 false true false true
Raphael datafusionДатаФусион false false false false
NULL NULL NULL NULL NULL NULL

# TODO: move it back to `string_query.slt.part` after fixing the issue
# https://github.com/apache/datafusion/issues/12618
query BB
SELECT
ascii_1 ~* '^a.{3}e',
unicode_1 ~* '^d.*Фу'
FROM test_basic_operator;
----
true false
false false
false true
NULL NULL

#
# common test for string-like functions and operators
#
Expand Down
13 changes: 0 additions & 13 deletions datafusion/sqllogictest/test_files/string/string.slt
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,6 @@ statement ok
create table test_substr as
select arrow_cast(col1, 'Utf8') as c1 from test_substr_base;

# TODO: move it back to `string_query.slt.part` after fixing the issue
# https://github.com/apache/datafusion/issues/12618
query BB
SELECT
ascii_1 ~* '^a.{3}e',
unicode_1 ~* '^d.*Фу'
FROM test_basic_operator;
----
true false
false false
false true
NULL NULL

# TODO: move it back to `string_query.slt.part` after fixing the issue
# see detail: https://github.com/apache/datafusion/issues/12637
# Test pattern with wildcard characters
Expand Down
22 changes: 10 additions & 12 deletions datafusion/sqllogictest/test_files/string/string_query.slt.part
Original file line number Diff line number Diff line change
Expand Up @@ -642,18 +642,16 @@ true false
false true
NULL NULL

# TODO: DictionaryString does not support ~* operator. Enable this after fixing the issue
# see issue: https://github.com/apache/datafusion/issues/12618
#query BB
#SELECT
# ascii_1 ~* '^a.{3}e',
# unicode_1 ~* '^d.*Фу'
#FROM test_basic_operator;
#----
#true false
#false false
#false true
#NULL NULL
query BB
SELECT
ascii_1 ~* '^a.{3}e',
unicode_1 ~* '^d.*Фу'
FROM test_basic_operator;
----
true false
false false
false true
NULL NULL

query BB
SELECT
Expand Down
13 changes: 0 additions & 13 deletions datafusion/sqllogictest/test_files/string/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,6 @@ select arrow_cast(col1, 'Utf8View') as c1 from test_substr_base;
statement ok
drop table test_source

# TODO: move it back to `string_query.slt.part` after fixing the issue
# https://github.com/apache/datafusion/issues/12618
query BB
SELECT
ascii_1 ~* '^a.{3}e',
unicode_1 ~* '^d.*Фу'
FROM test_basic_operator;
----
true false
false false
false true
NULL NULL

#
# common test for string-like functions and operators
#
Expand Down

0 comments on commit 4534a28

Please sign in to comment.