Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support FixedSizeList Type Coercion #9108

Merged
merged 21 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 33 additions & 18 deletions datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ use crate::{
};

use arrow::datatypes::{DataType, Field, Fields, IntervalUnit, TimeUnit};
use datafusion_common::{internal_err, plan_err, DataFusionError, Result};
use datafusion_common::{exec_err, plan_err, DataFusionError, Result};

use strum::IntoEnumIterator;
use strum_macros::EnumIter;
Expand Down Expand Up @@ -543,10 +543,11 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Flatten => {
fn get_base_type(data_type: &DataType) -> Result<DataType> {
match data_type {
DataType::List(field) if matches!(field.data_type(), DataType::List(_)) => get_base_type(field.data_type()),
DataType::List(field) | DataType::FixedSizeList(field, _) if matches!(field.data_type(), DataType::List(_)|DataType::FixedSizeList(_,_ )) => get_base_type(field.data_type()),
DataType::LargeList(field) if matches!(field.data_type(), DataType::LargeList(_)) => get_base_type(field.data_type()),
DataType::Null | DataType::List(_) | DataType::LargeList(_) => Ok(data_type.to_owned()),
_ => internal_err!("Not reachable, data_type should be List or LargeList"),
DataType::FixedSizeList(field,_ ) => Ok(DataType::List(field.clone())),
_ => exec_err!("Not reachable, data_type should be List, LargeList or FixedSizeList"),
}
}

Expand Down Expand Up @@ -923,51 +924,63 @@ impl BuiltinScalarFunction {
Signature::variadic_any(self.volatility())
}
BuiltinScalarFunction::ArrayAppend => {
Signature::array_and_element(self.volatility())
Signature::array_and_element(false, self.volatility())
}
BuiltinScalarFunction::MakeArray => {
// 0 or more arguments of arbitrary type
Signature::one_of(vec![VariadicEqual, Any(0)], self.volatility())
}
BuiltinScalarFunction::ArrayPopFront => Signature::any(1, self.volatility()),
BuiltinScalarFunction::ArrayPopBack => Signature::any(1, self.volatility()),
BuiltinScalarFunction::ArrayPopFront => {
Signature::array(true, self.volatility())
Weijun-H marked this conversation as resolved.
Show resolved Hide resolved
}
BuiltinScalarFunction::ArrayPopBack => {
Signature::array(true, self.volatility())
}
BuiltinScalarFunction::ArrayConcat => {
Signature::variadic_any(self.volatility())
}
BuiltinScalarFunction::ArrayDims => Signature::any(1, self.volatility()),
BuiltinScalarFunction::ArrayEmpty => Signature::any(1, self.volatility()),
BuiltinScalarFunction::ArrayDims => {
Signature::array(false, self.volatility())
}
BuiltinScalarFunction::ArrayEmpty => {
Signature::array(true, self.volatility())
}
BuiltinScalarFunction::ArrayElement => {
Signature::array_and_index(self.volatility())
Signature::array_and_index(true, self.volatility())
}
BuiltinScalarFunction::ArrayExcept => Signature::any(2, self.volatility()),
BuiltinScalarFunction::Flatten => Signature::any(1, self.volatility()),
BuiltinScalarFunction::Flatten => Signature::array(true, self.volatility()),
BuiltinScalarFunction::ArrayHasAll | BuiltinScalarFunction::ArrayHasAny => {
Signature::any(2, self.volatility())
}
BuiltinScalarFunction::ArrayHas => {
Signature::array_and_element(self.volatility())
Signature::array_and_element(true, self.volatility())
}
BuiltinScalarFunction::ArrayLength => {
Signature::variadic_any(self.volatility())
}
BuiltinScalarFunction::ArrayNdims => Signature::any(1, self.volatility()),
BuiltinScalarFunction::ArrayDistinct => Signature::any(1, self.volatility()),
BuiltinScalarFunction::ArrayNdims => {
Signature::array(false, self.volatility())
}
BuiltinScalarFunction::ArrayDistinct => {
Signature::array(true, self.volatility())
}
BuiltinScalarFunction::ArrayPosition => {
Signature::array_and_element_and_optional_index(self.volatility())
}
BuiltinScalarFunction::ArrayPositions => {
Signature::array_and_element(self.volatility())
Signature::array_and_element(true, self.volatility())
}
BuiltinScalarFunction::ArrayPrepend => {
Signature::element_and_array(self.volatility())
Signature::element_and_array(false, self.volatility())
}
BuiltinScalarFunction::ArrayRepeat => Signature::any(2, self.volatility()),
BuiltinScalarFunction::ArrayRemove => {
Signature::array_and_element(self.volatility())
Signature::array_and_element(true, self.volatility())
}
BuiltinScalarFunction::ArrayRemoveN => Signature::any(3, self.volatility()),
BuiltinScalarFunction::ArrayRemoveAll => {
Signature::array_and_element(self.volatility())
Signature::array_and_element(true, self.volatility())
}
BuiltinScalarFunction::ArrayReplace => Signature::any(3, self.volatility()),
BuiltinScalarFunction::ArrayReplaceN => Signature::any(4, self.volatility()),
Expand All @@ -981,7 +994,9 @@ impl BuiltinScalarFunction {

BuiltinScalarFunction::ArrayIntersect => Signature::any(2, self.volatility()),
BuiltinScalarFunction::ArrayUnion => Signature::any(2, self.volatility()),
BuiltinScalarFunction::Cardinality => Signature::any(1, self.volatility()),
BuiltinScalarFunction::Cardinality => {
Signature::array(false, self.volatility())
}
BuiltinScalarFunction::ArrayResize => {
Signature::variadic_any(self.volatility())
}
Expand Down
220 changes: 204 additions & 16 deletions datafusion/expr/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
//! Signature module contains foundational types that are used to represent signatures, types,
//! and return types of functions in DataFusion.

use crate::type_coercion::binary::comparison_coercion;
use arrow::datatypes::DataType;
use datafusion_common::utils::coerced_fixed_size_list_to_list;
use datafusion_common::{internal_datafusion_err, DataFusionError, Result};

/// Constant that is used as a placeholder for any valid timezone.
/// This is used where a function can accept a timestamp type with any
Expand Down Expand Up @@ -123,35 +126,211 @@ pub enum TypeSignature {
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ArrayFunctionSignature {
/// Specialized Signature for ArrayAppend and similar functions
/// The first argument should be List/LargeList, and the second argument should be non-list or list.
/// If `allow_null` is true, the function also accepts a single argument of type Null.
/// The first argument should be List/LargeList/FixedSizedList, and the second argument should be non-list or list.
/// The second argument's list dimension should be one dimension less than the first argument's list dimension.
/// List dimension of the List/LargeList is equivalent to the number of List.
/// List dimension of the non-list is 0.
ArrayAndElement,
ArrayAndElement(bool),
/// Specialized Signature for ArrayPrepend and similar functions
/// If `allow_null` is true, the function also accepts a single argument of type Null.
/// The first argument should be non-list or list, and the second argument should be List/LargeList.
/// The first argument's list dimension should be one dimension less than the second argument's list dimension.
ElementAndArray,
ElementAndArray(bool),
/// Specialized Signature for Array functions of the form (List/LargeList, Index)
ArrayAndIndex,
/// If `allow_null` is true, the function also accepts a single argument of type Null.
/// The first argument should be List/LargeList/FixedSizedList, and the second argument should be Int64.
ArrayAndIndex(bool),
/// Specialized Signature for Array functions of the form (List/LargeList, Element, Optional Index)
ArrayAndElementAndOptionalIndex,
/// Specialized Signature for ArrayEmpty and similar functions
Weijun-H marked this conversation as resolved.
Show resolved Hide resolved
/// The function takes a single argument that must be a List/LargeList/FixedSizeList
/// or something that can be coerced to one of those types.
/// If `allow_null` is true, the function also accepts a single argument of type Null.
Array(bool),
}

impl ArrayFunctionSignature {
/// Arguments to ArrayFunctionSignature
/// `current_types` - The data types of the arguments
/// `allow_null_coercion` - Whether null type coercion is allowed
/// Returns the valid types for the function signature
pub fn get_type_signature(
Copy link
Contributor

@jayzhan211 jayzhan211 Feb 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I don't think we need to place all signature functions into get_type_signature.
Instead, having separate signature functions. And, collect what we need in the matching function.

This way the function can be reused easily. For example,

fn array() return array_type.
match {
 signature::array => { vec![vec![array()]] }
  signature::array_and_index => { vec![vec![array(), i64]]  }
}

&self,
current_types: &[DataType],
) -> Result<Vec<Vec<DataType>>> {
fn array_element_and_optional_index(
current_types: &[DataType],
) -> Result<Vec<Vec<DataType>>> {
// make sure there's 2 or 3 arguments
if !(current_types.len() == 2 || current_types.len() == 3) {
return Ok(vec![vec![]]);
}

let first_two_types = &current_types[0..2];
let mut valid_types =
array_append_or_prepend_valid_types(first_two_types, true, true)?;

// Early return if there are only 2 arguments
if current_types.len() == 2 {
return Ok(valid_types);
}

let valid_types_with_index = valid_types
.iter()
.map(|t| {
let mut t = t.clone();
t.push(DataType::Int64);
t
})
.collect::<Vec<_>>();

valid_types.extend(valid_types_with_index);

Ok(valid_types)
}

fn array_append_or_prepend_valid_types(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there any reason to create these inner functions, rather than actual functions (or maybe just inline the body of the functions into the match statement below)?

I find the extra level of indenting somewhat confusing

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is only for the match statement below

current_types: &[DataType],
is_append: bool,
allow_null_coercion: bool,
) -> Result<Vec<Vec<DataType>>> {
if current_types.len() != 2 {
return Ok(vec![vec![]]);
}

let (array_type, elem_type) = if is_append {
(&current_types[0], &current_types[1])
} else {
(&current_types[1], &current_types[0])
};

// We follow Postgres on `array_append(Null, T)`, which is not valid.
if array_type.eq(&DataType::Null) {
if allow_null_coercion {
return Ok(vec![vec![array_type.clone(), elem_type.clone()]]);
} else {
return Ok(vec![vec![]]);
}
}

// We need to find the coerced base type, mainly for cases like:
// `array_append(List(null), i64)` -> `List(i64)`
let array_base_type = datafusion_common::utils::base_type(array_type);
let elem_base_type = datafusion_common::utils::base_type(elem_type);
let new_base_type = comparison_coercion(&array_base_type, &elem_base_type);

let new_base_type = new_base_type.ok_or_else(|| {
internal_datafusion_err!(
"Coercion from {array_base_type:?} to {elem_base_type:?} not supported."
)
})?;

let new_array_type =
datafusion_common::utils::coerced_type_with_base_type_only(
array_type,
&new_base_type,
);

match new_array_type {
DataType::List(ref field)
| DataType::LargeList(ref field)
| DataType::FixedSizeList(ref field, _) => {
let new_elem_type = field.data_type();
if is_append {
Ok(vec![vec![new_array_type.clone(), new_elem_type.clone()]])
} else {
Ok(vec![vec![new_elem_type.to_owned(), new_array_type.clone()]])
}
}
_ => Ok(vec![vec![]]),
}
}
fn array_and_index(
current_types: &[DataType],
allow_null_coercion: bool,
) -> Result<Vec<Vec<DataType>>> {
if current_types.len() != 2 {
return Ok(vec![vec![]]);
}

let array_type = &current_types[0];

if array_type.eq(&DataType::Null) && !allow_null_coercion {
return Ok(vec![vec![]]);
}

match array_type {
jayzhan211 marked this conversation as resolved.
Show resolved Hide resolved
DataType::List(_)
| DataType::LargeList(_)
| DataType::FixedSizeList(_, _) => {
let array_type = coerced_fixed_size_list_to_list(array_type);
Ok(vec![vec![array_type, DataType::Int64]])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think it can be further simplied if there is a function that process coerced_fixed_size_list_to_list and return array_type. Then can be used both in array_and_index and array

}
DataType::Null => Ok(vec![vec![array_type.clone(), DataType::Int64]]),
_ => Ok(vec![vec![]]),
}
}
fn array(
current_types: &[DataType],
allow_null_coercion: bool,
) -> Result<Vec<Vec<DataType>>> {
if current_types.len() != 1
|| (current_types[0].is_null() && !allow_null_coercion)
{
return Ok(vec![vec![]]);
}

let array_type = &current_types[0];

match array_type {
DataType::List(_)
| DataType::LargeList(_)
| DataType::FixedSizeList(_, _) => {
let array_type = coerced_fixed_size_list_to_list(array_type);
Ok(vec![vec![array_type]])
}
DataType::Null => Ok(vec![vec![array_type.clone()]]),
_ => Ok(vec![vec![]]),
}
}
match self {
ArrayFunctionSignature::ArrayAndElement(allow_null) => {
array_append_or_prepend_valid_types(current_types, true, *allow_null)
}
ArrayFunctionSignature::ElementAndArray(allow_null) => {
array_append_or_prepend_valid_types(current_types, false, *allow_null)
}
ArrayFunctionSignature::ArrayAndIndex(allow_null) => {
array_and_index(current_types, *allow_null)
}
ArrayFunctionSignature::ArrayAndElementAndOptionalIndex => {
array_element_and_optional_index(current_types)
}
ArrayFunctionSignature::Array(allow_null) => {
array(current_types, *allow_null)
}
}
}
}

impl std::fmt::Display for ArrayFunctionSignature {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ArrayFunctionSignature::ArrayAndElement => {
write!(f, "array, element")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this change?

ArrayFunctionSignature::ArrayAndElement(allow_null) => {
write!(f, "ArrayAndElement({})", *allow_null)
}
ArrayFunctionSignature::ArrayAndElementAndOptionalIndex => {
write!(f, "array, element, [index]")

This comment was marked as outdated.

}
ArrayFunctionSignature::ElementAndArray => {
write!(f, "element, array")
ArrayFunctionSignature::ElementAndArray(allow_null) => {
write!(f, "ElementAndArray({})", *allow_null)
}
ArrayFunctionSignature::ArrayAndIndex(allow_null) => {
write!(f, "ArrayAndIndex({})", *allow_null)
}
ArrayFunctionSignature::ArrayAndIndex => {
write!(f, "array, index")
ArrayFunctionSignature::Array(allow_null) => {
write!(f, "Array({})", *allow_null)
}
}
}
Expand Down Expand Up @@ -290,10 +469,10 @@ impl Signature {
}
}
/// Specialized Signature for ArrayAppend and similar functions
pub fn array_and_element(volatility: Volatility) -> Self {
pub fn array_and_element(allow_null: bool, volatility: Volatility) -> Self {
Signature {
type_signature: TypeSignature::ArraySignature(
ArrayFunctionSignature::ArrayAndElement,
ArrayFunctionSignature::ArrayAndElement(allow_null),
),
volatility,
}
Expand All @@ -308,23 +487,32 @@ impl Signature {
}
}
/// Specialized Signature for ArrayPrepend and similar functions
pub fn element_and_array(volatility: Volatility) -> Self {
pub fn element_and_array(allow_null: bool, volatility: Volatility) -> Self {
Signature {
type_signature: TypeSignature::ArraySignature(
ArrayFunctionSignature::ElementAndArray,
ArrayFunctionSignature::ElementAndArray(allow_null),
),
volatility,
}
}
/// Specialized Signature for ArrayElement and similar functions
pub fn array_and_index(volatility: Volatility) -> Self {
pub fn array_and_index(allow_null: bool, volatility: Volatility) -> Self {
Signature {
type_signature: TypeSignature::ArraySignature(
ArrayFunctionSignature::ArrayAndIndex,
ArrayFunctionSignature::ArrayAndIndex(allow_null),
),
volatility,
}
}
/// Specialized Signature for ArrayEmpty and similar functions
pub fn array(allow_null: bool, volatility: Volatility) -> Self {
Signature {
type_signature: TypeSignature::ArraySignature(ArrayFunctionSignature::Array(
allow_null,
)),
volatility,
}
}
}

/// Monotonicity of the `ScalarFunctionExpr` with respect to its arguments.
Expand Down
Loading
Loading