Skip to content

Commit

Permalink
Support automatic Binary/LargeBinary --> Utf8 coercion
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb committed Oct 16, 2023
1 parent 2a4e7f7 commit dd8f41e
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 8 deletions.
5 changes: 2 additions & 3 deletions datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1558,10 +1558,9 @@ macro_rules! make_utf8_to_return_type {
}
};
}

/// `utf8_to_str_type` returns either a Utf8 or LargeUtf8 based on the input type size.
// `utf8_to_str_type` returns either a Utf8 or LargeUtf8 based on the input type size.
make_utf8_to_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8);
/// `utf8_to_str_type` returns either a Int32 or Int64 based on the input type size.
// `utf8_to_int_type` returns either an Int32 or Int64 based on the input type size.
make_utf8_to_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);

fn utf8_or_binary_to_binary_type(arg_type: &DataType, name: &str) -> Result<DataType> {
Expand Down
12 changes: 8 additions & 4 deletions datafusion/expr/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -650,13 +650,17 @@ fn string_concat_internal_coercion(

/// Coercion rules for Strings: the type that both lhs and rhs can be
/// cast to for the purpose of a string computation
///
/// Note this also permits coercing `Binary` and `LargeBinary` types to
/// `Utf8` and `LargeUtf8`, which will actually generate an error at runtime
/// if the binary field holds invalid Utf8 data
fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
(Utf8, Utf8) => Some(Utf8),
(LargeUtf8, Utf8) => Some(LargeUtf8),
(Utf8, LargeUtf8) => Some(LargeUtf8),
(LargeUtf8, LargeUtf8) => Some(LargeUtf8),
(Utf8, Utf8 | Binary) => Some(Utf8),
(Utf8 | Binary, LargeUtf8) => Some(LargeUtf8),
(LargeUtf8 | LargeBinary, Utf8 | Binary) => Some(LargeUtf8),
(LargeUtf8 | LargeBinary, LargeUtf8 | LargeBinary) => Some(LargeUtf8),
// TODO: cast between array elements (#6558)
(List(_), List(_)) => Some(lhs_type.clone()),
(List(_), _) => Some(lhs_type.clone()),
Expand Down
12 changes: 11 additions & 1 deletion datafusion/sqllogictest/test_files/binary.slt
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,9 @@ NULL
query error DataFusion error: type_coercion
SELECT binary FROM t where binary LIKE '%F';

query error DataFusion error: type_coercion
query ?
SELECT largebinary FROM t where largebinary LIKE '%F';
----


# character_length function
Expand All @@ -239,6 +240,15 @@ NULL NULL NULL NULL
Bar 3 Bar 3
FooBar 6 FooBar 6

query I
SELECT character_length(X'20');
----
1

# still errors on values that cannot be coerced to utf8
query error Encountered non UTF\-8 data: invalid utf\-8 sequence of 1 bytes from index 0
SELECT character_length(X'c328');

# regexp_replace
query TTTT
SELECT
Expand Down

0 comments on commit dd8f41e

Please sign in to comment.