From 8720f021c7f4f03d239c73e08328a76eee6980eb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 2 Oct 2024 14:23:10 -0400 Subject: [PATCH] remove upstreamed implementation --- datafusion/functions/src/lib.rs | 3 - datafusion/functions/src/regexp_common.rs | 123 -------------------- datafusion/functions/src/string/contains.rs | 20 ++-- 3 files changed, 10 insertions(+), 136 deletions(-) delete mode 100644 datafusion/functions/src/regexp_common.rs diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs index bb680f3c67de..81be5552666d 100644 --- a/datafusion/functions/src/lib.rs +++ b/datafusion/functions/src/lib.rs @@ -92,9 +92,6 @@ pub mod macros; pub mod string; make_stub_package!(string, "string_expressions"); -#[cfg(feature = "string_expressions")] -mod regexp_common; - /// Core datafusion expressions /// Enabled via feature flag `core_expressions` #[cfg(feature = "core_expressions")] diff --git a/datafusion/functions/src/regexp_common.rs b/datafusion/functions/src/regexp_common.rs deleted file mode 100644 index 748c1a294f97..000000000000 --- a/datafusion/functions/src/regexp_common.rs +++ /dev/null @@ -1,123 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Common utilities for implementing regex functions - -use crate::string::common::StringArrayType; - -use arrow::array::{Array, ArrayDataBuilder, BooleanArray}; -use arrow::datatypes::DataType; -use arrow_buffer::{BooleanBufferBuilder, NullBuffer}; -use datafusion_common::DataFusionError; -use regex::Regex; - -use std::collections::HashMap; - -#[cfg(doc)] -use arrow::array::{LargeStringArray, StringArray, StringViewArray}; -/// Perform SQL `array ~ regex_array` operation on -/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`]. -/// -/// If `regex_array` element has an empty value, the corresponding result value is always true. -/// -/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] flag, -/// which allow special search modes, such as case-insensitive and multi-line mode. -/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) -/// for more information. -/// -/// It is inspired / copied from `regexp_is_match_utf8` [arrow-rs]. -/// -/// Can remove when is implemented upstream -/// -/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/8c956a9f9ab26c14072740cce64c2b99cb039b13/arrow-string/src/regexp.rs#L31-L37 -pub fn regexp_is_match_utf8<'a, S1, S2, S3>( - array: &'a S1, - regex_array: &'a S2, - flags_array: Option<&'a S3>, -) -> datafusion_common::Result -where - &'a S1: StringArrayType<'a>, - &'a S2: StringArrayType<'a>, - &'a S3: StringArrayType<'a>, -{ - if array.len() != regex_array.len() { - return Err(DataFusionError::Execution( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let nulls = NullBuffer::union(array.nulls(), regex_array.nulls()); - - let mut patterns: HashMap = HashMap::new(); - let mut result = BooleanBufferBuilder::new(array.len()); - - let complete_pattern = match flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(flag) => format!("(?{flag}){pattern}"), - None => pattern.to_string(), - }) - }, - )) as Box>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; - - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - (Some(_), Some(pattern)) if pattern == *"" => { - result.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re, - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - DataFusionError::Execution(format!( - "Regular expression did not compile: {e:?}" - )) - })?; - patterns.entry(pattern).or_insert(re) - } - }; - result.append(re.is_match(value)); - } - _ => result.append(false), - } - Ok(()) - }) - .collect::, DataFusionError>>()?; - - let data = unsafe { - ArrayDataBuilder::new(DataType::Boolean) - .len(array.len()) - .buffers(vec![result.into()]) - .nulls(nulls) - .build_unchecked() - }; - - Ok(BooleanArray::from(data)) -} diff --git a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs index c319f80661c3..9b18b1df2984 100644 --- a/datafusion/functions/src/string/contains.rs +++ b/datafusion/functions/src/string/contains.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -use crate::regexp_common::regexp_is_match_utf8; use crate::utils::make_scalar_function; use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray, StringViewArray}; @@ -30,6 +29,7 @@ use datafusion_expr::{ColumnarValue, Signature, Volatility}; use std::any::Any; use std::sync::Arc; +use arrow::compute::regexp_is_match; #[derive(Debug)] pub struct ContainsFunc { @@ -92,7 +92,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8View, Utf8View) => { let mod_str = args[0].as_string_view(); let match_str = args[1].as_string_view(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< StringViewArray, StringViewArray, GenericStringArray, @@ -103,7 +103,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8View, Utf8) => { let mod_str = args[0].as_string_view(); let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< StringViewArray, GenericStringArray, GenericStringArray, @@ -114,7 +114,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8View, LargeUtf8) => { let mod_str = args[0].as_string_view(); let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< StringViewArray, GenericStringArray, GenericStringArray, @@ -125,7 +125,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8, Utf8View) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string_view(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray, StringViewArray, GenericStringArray, @@ -136,7 +136,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8, Utf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray, GenericStringArray, GenericStringArray, @@ -147,7 +147,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8, LargeUtf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray, GenericStringArray, GenericStringArray, @@ -158,7 +158,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (LargeUtf8, Utf8View) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string_view(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray, StringViewArray, GenericStringArray, @@ -169,7 +169,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (LargeUtf8, Utf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray, GenericStringArray, GenericStringArray, @@ -180,7 +180,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (LargeUtf8, LargeUtf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray, GenericStringArray, GenericStringArray,