Skip to content

Commit

Permalink
Support binary argumnets (via coercion) for like and ilike and st…
Browse files Browse the repository at this point in the history
…ring functions

fixup
  • Loading branch information
alamb committed Oct 17, 2023
1 parent e00932c commit 56514be
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 29 deletions.
23 changes: 19 additions & 4 deletions datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1523,16 +1523,29 @@ impl FromStr for BuiltinScalarFunction {
}
}

/// Creates a function that returns the return type of a string function given
/// the type of its first argument.
///
/// If the input type is `LargeUtf8` or `LargeBinary` the return type is
/// `$largeUtf8Type`,
///
/// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`,
macro_rules! make_utf8_to_return_type {
($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => {
fn $FUNC(arg_type: &DataType, name: &str) -> Result<DataType> {
Ok(match arg_type {
DataType::LargeUtf8 => $largeUtf8Type,
DataType::Utf8 => $utf8Type,
DataType::LargeUtf8 => $largeUtf8Type,
// LargeBinary inputs are automatically coerced to Utf8
DataType::LargeBinary => $largeUtf8Type,
DataType::Utf8 => $utf8Type,
// Binary inputs are automatically coerced to Utf8
DataType::Binary => $utf8Type,
DataType::Null => DataType::Null,
DataType::Dictionary(_, value_type) => match **value_type {
DataType::LargeUtf8 => $largeUtf8Type,
DataType::LargeUtf8 => $largeUtf8Type,
DataType::LargeBinary => $largeUtf8Type,
DataType::Utf8 => $utf8Type,
DataType::Binary => $utf8Type,
DataType::Null => DataType::Null,
_ => {
return plan_err!(
Expand All @@ -1553,8 +1566,10 @@ macro_rules! make_utf8_to_return_type {
}
};
}

// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size.
make_utf8_to_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8);

// `utf8_to_str_type`: returns either a Int32 or Int64 based on the input type size.
make_utf8_to_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);

fn utf8_or_binary_to_binary_type(arg_type: &DataType, name: &str) -> Result<DataType> {
Expand Down
113 changes: 99 additions & 14 deletions datafusion/expr/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ use datafusion_common::{plan_err, DataFusionError};

use crate::Operator;

/// The type signature of an instantiation of binary expression
/// The type signature of an instantiation of binary operator expression such as
/// `lhs + rhs`
///
/// Note this is different than [`crate::signature::Signature`] which
/// describes the type signature of a function.
struct Signature {
/// The type to coerce the left argument to
lhs: DataType,
Expand Down Expand Up @@ -648,8 +652,9 @@ fn string_concat_internal_coercion(
}
}

/// Coercion rules for Strings: the type that both lhs and rhs can be
/// casted to for the purpose of a string computation
/// Coercion rules for string types (Utf8/LargeUtf8): If at least on argument is
/// a string type and both arguments can be coerced into a string type, coerce
/// to string type.
fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
Expand All @@ -665,8 +670,30 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType>
}
}

/// Coercion rules for Binaries: the type that both lhs and rhs can be
/// casted to for the purpose of a computation
/// Coercion rules for binary (Binary/LargeBinary) to string (Utf8/LargeUtf8):
/// If one argument is binary and the other is a string then coerce to string
/// (e.g. for `like`)
fn binary_to_string_coercion(
lhs_type: &DataType,
rhs_type: &DataType,
) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
(Binary, Utf8) => Some(Utf8),
(Binary, LargeUtf8) => Some(LargeUtf8),
(LargeBinary, Utf8) => Some(LargeUtf8),
(LargeBinary, LargeUtf8) => Some(LargeUtf8),
(Utf8, Binary) => Some(Utf8),
(Utf8, LargeBinary) => Some(LargeUtf8),
(LargeUtf8, Binary) => Some(LargeUtf8),
(LargeUtf8, LargeBinary) => Some(LargeUtf8),
_ => None,
}
}

/// Coercion rules for binary types (Binary/LargeBinary): If at least on argument is
/// a binary type and both arguments can be coerced into a binary type, coerce
/// to binary type.
fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
Expand All @@ -681,6 +708,7 @@ fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType>
/// This is a union of string coercion rules and dictionary coercion rules
pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
string_coercion(lhs_type, rhs_type)
.or_else(|| binary_to_string_coercion(lhs_type, rhs_type))
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false))
.or_else(|| null_coercion(lhs_type, rhs_type))
}
Expand Down Expand Up @@ -763,7 +791,7 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataTyp
}
}

/// coercion rules from NULL type. Since NULL can be casted to most of types in arrow,
/// coercion rules from NULL type. Since NULL can be casted to any other type in arrow,
/// either lhs or rhs is NULL, if NULL can be casted to type of the other side, the coercion is valid.
fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
match (lhs_type, rhs_type) {
Expand Down Expand Up @@ -917,11 +945,33 @@ mod tests {
);
}

/// Test coercion rules for binary operators
///
/// Applies coercion rules for `$LHS_TYPE $OP $RHS_TYPE` and asserts that the
/// the result type is `$RESULT_TYPE`
macro_rules! test_coercion_binary_rule {
($A_TYPE:expr, $B_TYPE:expr, $OP:expr, $C_TYPE:expr) => {{
let (lhs, rhs) = get_input_types(&$A_TYPE, &$OP, &$B_TYPE)?;
assert_eq!(lhs, $C_TYPE);
assert_eq!(rhs, $C_TYPE);
($LHS_TYPE:expr, $RHS_TYPE:expr, $OP:expr, $RESULT_TYPE:expr) => {{
let (lhs, rhs) = get_input_types(&$LHS_TYPE, &$OP, &$RHS_TYPE)?;
assert_eq!(lhs, $RESULT_TYPE);
assert_eq!(rhs, $RESULT_TYPE);
}};
}

/// Test coercion rules for like
///
/// Applies coercion rules for both
/// * `$LHS_TYPE LIKE $RHS_TYPE`
/// * `$RHS_TYPE LIKE $LHS_TYPE`
///
/// And asserts the result type is `$RESULT_TYPE`
macro_rules! test_like_rule {
($LHS_TYPE:expr, $RHS_TYPE:expr, $RESULT_TYPE:expr) => {{
println!("Coercing {} LIKE {}", $LHS_TYPE, $RHS_TYPE);
let result = like_coercion(&$LHS_TYPE, &$RHS_TYPE);
assert_eq!(result, $RESULT_TYPE);
// reverse the order
let result = like_coercion(&$RHS_TYPE, &$LHS_TYPE);
assert_eq!(result, $RESULT_TYPE);
}};
}

Expand All @@ -948,11 +998,46 @@ mod tests {
}

#[test]
fn test_type_coercion() -> Result<()> {
// test like coercion rule
let result = like_coercion(&DataType::Utf8, &DataType::Utf8);
assert_eq!(result, Some(DataType::Utf8));
fn test_like_coercion() {
// string coerce to strings
test_like_rule!(DataType::Utf8, DataType::Utf8, Some(DataType::Utf8));
test_like_rule!(
DataType::LargeUtf8,
DataType::Utf8,
Some(DataType::LargeUtf8)
);
test_like_rule!(
DataType::Utf8,
DataType::LargeUtf8,
Some(DataType::LargeUtf8)
);
test_like_rule!(
DataType::LargeUtf8,
DataType::LargeUtf8,
Some(DataType::LargeUtf8)
);

// Also coerce binary to strings
test_like_rule!(DataType::Binary, DataType::Utf8, Some(DataType::Utf8));
test_like_rule!(
DataType::LargeBinary,
DataType::Utf8,
Some(DataType::LargeUtf8)
);
test_like_rule!(
DataType::Binary,
DataType::LargeUtf8,
Some(DataType::LargeUtf8)
);
test_like_rule!(
DataType::LargeBinary,
DataType::LargeUtf8,
Some(DataType::LargeUtf8)
);
}

#[test]
fn test_type_coercion() -> Result<()> {
test_coercion_binary_rule!(
DataType::Utf8,
DataType::Date32,
Expand Down
11 changes: 10 additions & 1 deletion datafusion/expr/src/type_coercion/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ pub fn data_types(
return Ok(current_types.to_vec());
}

// Try and coerce the argument types to match the signature, returning the
// coerced types from the first matching signature.
for valid_types in valid_types {
if let Some(types) = maybe_data_types(&valid_types, current_types) {
return Ok(types);
Expand All @@ -60,6 +62,7 @@ pub fn data_types(
)
}

/// Returns a Vec of all possible valid argument types for the given signature.
fn get_valid_types(
signature: &TypeSignature,
current_types: &[DataType],
Expand Down Expand Up @@ -104,7 +107,12 @@ fn get_valid_types(
Ok(valid_types)
}

/// Try to coerce current_types into valid_types.
/// Try to coerce the current argument types to match the given `valid_types`.
///
/// For example, if a function `func` accepts arguments of `(int64, int64)`,
/// but was called with `(int32, int64)`, this function could match the
/// valid_types by by coercing the first argument to `int64`, and would return
/// `Some([int64, int64])`.
fn maybe_data_types(
valid_types: &[DataType],
current_types: &[DataType],
Expand Down Expand Up @@ -220,6 +228,7 @@ fn coerced_from<'a>(
Some(type_into.clone())
}
Interval(_) if matches!(type_from, Utf8 | LargeUtf8) => Some(type_into.clone()),
// Any type can be coerced into strings
Utf8 | LargeUtf8 => Some(type_into.clone()),
Null if can_cast_types(type_from, type_into) => Some(type_into.clone()),

Expand Down
41 changes: 31 additions & 10 deletions datafusion/sqllogictest/test_files/binary.slt
Original file line number Diff line number Diff line change
Expand Up @@ -217,30 +217,51 @@ SELECT largebinary FROM t ORDER BY largebinary;
NULL

# LIKE
# https://github.com/apache/arrow-datafusion/issues/7342
query error DataFusion error: type_coercion
SELECT binary FROM t where binary LIKE '%F';

query error DataFusion error: type_coercion
SELECT largebinary FROM t where largebinary LIKE '%F';
query ?
SELECT binary FROM t where binary LIKE '%F%';
----
466f6f
466f6f426172

query ?
SELECT largebinary FROM t where largebinary LIKE '%F%';
----
466f6f
466f6f426172

# character_length function
# https://github.com/apache/arrow-datafusion/issues/7344
query error DataFusion error: Error during planning: The "character_length" function can only accept strings, but got Binary\.
query TITI
SELECT
cast(binary as varchar) as str,
character_length(binary) as binary_len,
cast(largebinary as varchar) as large_str,
character_length(binary) as largebinary_len
from t;
----
Foo 3 Foo 3
NULL NULL NULL NULL
Bar 3 Bar 3
FooBar 6 FooBar 6

query I
SELECT character_length(X'20');
----
1

# still errors on values that can not be coerced to utf8
query error Encountered non UTF\-8 data: invalid utf\-8 sequence of 1 bytes from index 0
SELECT character_length(X'c328');

# regexp_replace
# https://github.com/apache/arrow-datafusion/issues/7345
query error DataFusion error: Error during planning: The "regexp_replace" function can only accept strings, but got Binary\.
query TTTT
SELECT
cast(binary as varchar) as str,
regexp_replace(binary, 'F', 'f') as binary_replaced,
cast(largebinary as varchar) as large_str,
regexp_replace(largebinary, 'F', 'f') as large_binary_replaced
from t;
----
Foo foo Foo foo
NULL NULL NULL NULL
Bar Bar Bar Bar
FooBar fooBar FooBar fooBar

0 comments on commit 56514be

Please sign in to comment.