Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support DictionaryString for Regex matching operators #12768

Merged
merged 16 commits into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/expr-common/src/interval_arithmetic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1753,7 +1753,7 @@ impl NullableInterval {
}
_ => Ok(Self::MaybeNull { values }),
}
} else if op.is_comparison_operator() {
} else if op.supports_propagation() {
Ok(Self::Null {
datatype: DataType::Boolean,
})
Expand Down
7 changes: 4 additions & 3 deletions datafusion/expr-common/src/operator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,11 @@ impl Operator {
)
}

/// Return true if the operator is a comparison operator.
/// Return true if the comparison operator can be used in interval arithmetic and constraint
/// propagation
///
/// For example, 'Binary(a, >, b)' would be a comparison expression.
pub fn is_comparison_operator(&self) -> bool {
/// For example, 'Binary(a, >, b)' expression supports propagation.
pub fn supports_propagation(&self) -> bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is technically an API change -- Can you please avoid the change and help people upgrade by adding back is_comparison_operator that calls supports_propagation and mark it deprecated?

#[deprecated(
since = "40.0.0",
note = "please use `statistics_from_parquet_meta_calc` instead"
)]
pub async fn statistics_from_parquet_meta(
metadata: &ParquetMetaData,
table_schema: SchemaRef,
) -> Result<Statistics> {
statistics_from_parquet_meta_calc(metadata, table_schema)
}

matches!(
self,
Operator::Eq
Expand Down
2 changes: 1 addition & 1 deletion datafusion/optimizer/src/unwrap_cast_in_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter {
};
is_supported_type(&left_type)
&& is_supported_type(&right_type)
&& op.is_comparison_operator()
&& op.supports_propagation()
} =>
{
match (left.as_mut(), right.as_mut()) {
Expand Down
59 changes: 47 additions & 12 deletions datafusion/physical-expr/src/expressions/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,37 @@ macro_rules! compute_utf8_flag_op {
}

macro_rules! binary_string_array_flag_op_scalar {
($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
($LEFT:ident, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
// This macro is slightly different from binary_string_array_flag_op because, when comparing with a scalar value,
// the query can be optimized in such a way that operands will be dicts, so we need to support it here
let result: Result<Arc<dyn Array>> = match $LEFT.data_type() {
DataType::Utf8View | DataType::Utf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG)
},
DataType::LargeUtf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG)
},
DataType::Dictionary(_, _) => {
let values = $LEFT.as_any_dictionary().values();

match values.data_type() {
DataType::Utf8View | DataType::Utf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG),
DataType::LargeUtf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG),
other => internal_err!(
"Data type {:?} not supported as a dictionary value type for binary_string_array_flag_op_scalar operation '{}' on string array",
other, stringify!($OP)
),
}.map(
// downcast_dictionary_array duplicates code per possible key type, so we aim to do all prep work before
|evaluated_values| downcast_dictionary_array! {
$LEFT => {
let unpacked_dict = evaluated_values.take_iter($LEFT.keys().iter().map(|opt| opt.map(|v| v as _))).collect::<BooleanArray>();
Arc::new(unpacked_dict) as _
},
_ => unreachable!(),
}
)
},
other => internal_err!(
"Data type {:?} not supported for binary_string_array_flag_op_scalar operation '{}' on string array",
other, stringify!($OP)
Expand All @@ -211,20 +234,32 @@ macro_rules! compute_utf8_flag_op_scalar {
.downcast_ref::<$ARRAYTYPE>()
.expect("compute_utf8_flag_op_scalar failed to downcast array");

if let ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) = $RIGHT {
let flag = $FLAG.then_some("i");
let mut array =
paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
if $NOT {
array = not(&array).unwrap();
}
Ok(Arc::new(array))
} else {
internal_err!(
let string_value = match $RIGHT {
ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) => string_value,
ScalarValue::Dictionary(_, value) => {
match *value {
ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) => string_value,
other => return internal_err!(
"compute_utf8_flag_op_scalar failed to cast dictionary value {} for operation '{}'",
other, stringify!($OP)
)
}
},
_ => return internal_err!(
"compute_utf8_flag_op_scalar failed to cast literal value {} for operation '{}'",
$RIGHT, stringify!($OP)
)

};

let flag = $FLAG.then_some("i");
let mut array =
paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
if $NOT {
array = not(&array).unwrap();
}

Ok(Arc::new(array))
}};
}

Expand Down Expand Up @@ -429,7 +464,7 @@ impl PhysicalExpr for BinaryExpr {
// end-points of its children.
Ok(Some(vec![]))
}
} else if self.op.is_comparison_operator() {
} else if self.op.supports_propagation() {
Ok(
propagate_comparison(&self.op, interval, left_interval, right_interval)?
.map(|(left, right)| vec![left, right]),
Expand Down
13 changes: 0 additions & 13 deletions datafusion/sqllogictest/test_files/string/large_string.slt
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,6 @@ Xiangpeng datafusion数据融合 false true false true
Raphael datafusionДатаФусион false false false false
NULL NULL NULL NULL NULL NULL

# TODO: move it back to `string_query.slt.part` after fixing the issue
# https://github.com/apache/datafusion/issues/12618
query BB
SELECT
ascii_1 ~* '^a.{3}e',
unicode_1 ~* '^d.*Фу'
FROM test_basic_operator;
----
true false
false false
false true
NULL NULL

#
# common test for string-like functions and operators
#
Expand Down
13 changes: 0 additions & 13 deletions datafusion/sqllogictest/test_files/string/string.slt
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,6 @@ statement ok
create table test_substr as
select arrow_cast(col1, 'Utf8') as c1 from test_substr_base;

# TODO: move it back to `string_query.slt.part` after fixing the issue
# https://github.com/apache/datafusion/issues/12618
query BB
SELECT
ascii_1 ~* '^a.{3}e',
unicode_1 ~* '^d.*Фу'
FROM test_basic_operator;
----
true false
false false
false true
NULL NULL

# TODO: move it back to `string_query.slt.part` after fixing the issue
# see detail: https://github.com/apache/datafusion/issues/12637
# Test pattern with wildcard characters
Expand Down
22 changes: 10 additions & 12 deletions datafusion/sqllogictest/test_files/string/string_query.slt.part
Original file line number Diff line number Diff line change
Expand Up @@ -642,18 +642,16 @@ true false
false true
NULL NULL

# TODO: DictionaryString does not support ~* operator. Enable this after fixing the issue
# see issue: https://github.com/apache/datafusion/issues/12618
#query BB
#SELECT
# ascii_1 ~* '^a.{3}e',
# unicode_1 ~* '^d.*Фу'
#FROM test_basic_operator;
#----
#true false
#false false
#false true
#NULL NULL
query BB
SELECT
ascii_1 ~* '^a.{3}e',
unicode_1 ~* '^d.*Фу'
FROM test_basic_operator;
----
true false
false false
false true
NULL NULL

query BB
SELECT
Expand Down
13 changes: 0 additions & 13 deletions datafusion/sqllogictest/test_files/string/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,6 @@ select arrow_cast(col1, 'Utf8View') as c1 from test_substr_base;
statement ok
drop table test_source

# TODO: move it back to `string_query.slt.part` after fixing the issue
# https://github.com/apache/datafusion/issues/12618
query BB
SELECT
ascii_1 ~* '^a.{3}e',
unicode_1 ~* '^d.*Фу'
FROM test_basic_operator;
----
true false
false false
false true
NULL NULL

#
# common test for string-like functions and operators
#
Expand Down