Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add append_many to dictionary arrays to allow adding repeated values #6534

Merged
merged 3 commits into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions arrow-array/src/array/dictionary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1072,6 +1072,74 @@ mod tests {
assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4]));
}

#[test]
fn test_dictionary_builder_append_many() {
let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();

builder.append(1).unwrap();
builder.append_n(2, 2).unwrap();
builder.append_options(None, 2);
builder.append_options(Some(3), 3);

let array = builder.finish();

let values = array
.values()
.as_primitive::<UInt32Type>()
.iter()
.map(Option::unwrap)
.collect::<Vec<_>>();
assert_eq!(values, &[1, 2, 3]);
let keys = array.keys().iter().collect::<Vec<_>>();
assert_eq!(
keys,
&[
Some(0),
Some(1),
Some(1),
None,
None,
Some(2),
Some(2),
Some(2)
]
);
}

#[test]
fn test_string_dictionary_builder_append_many() {
let mut builder = StringDictionaryBuilder::<Int8Type>::new();

builder.append("a").unwrap();
builder.append_n("b", 2).unwrap();
builder.append_options(None::<&str>, 2);
builder.append_options(Some("c"), 3);

let array = builder.finish();

let values = array
.values()
.as_string::<i32>()
.iter()
.map(Option::unwrap)
.collect::<Vec<_>>();
assert_eq!(values, &["a", "b", "c"]);
let keys = array.keys().iter().collect::<Vec<_>>();
assert_eq!(
keys,
&[
Some(0),
Some(1),
Some(1),
None,
None,
Some(2),
Some(2),
Some(2)
]
);
}

#[test]
fn test_dictionary_array_fmt_debug() {
let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
Expand Down
58 changes: 50 additions & 8 deletions arrow-array/src/builder/generic_bytes_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,12 +195,7 @@ where
K: ArrowDictionaryKeyType,
T: ByteArrayType,
{
/// Append a value to the array. Return an existing index
/// if already present in the values array or a new index if the
/// value is appended to the values array.
///
/// Returns an error if the new index would overflow the key type.
pub fn append(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
fn get_or_insert_key(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
let value_native: &T::Native = value.as_ref();
let value_bytes: &[u8] = value_native.as_ref();

Expand All @@ -223,8 +218,32 @@ where
.get();

let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;

Ok(key)
}

/// Append a value to the array. Return an existing index
/// if already present in the values array or a new index if the
/// value is appended to the values array.
///
/// Returns an error if the new index would overflow the key type.
pub fn append(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
let key = self.get_or_insert_key(value)?;
self.keys_builder.append_value(key);
Ok(key)
}

/// Append a value multiple times to the array.
/// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups.
///
/// Returns an error if the new index would overflow the key type.
pub fn append_n(
&mut self,
value: impl AsRef<T::Native>,
count: usize,
) -> Result<K::Native, ArrowError> {
let key = self.get_or_insert_key(value)?;
self.keys_builder.append_value_repeated(key, count);
Ok(key)
}

Expand All @@ -237,6 +256,17 @@ where
self.append(value).expect("dictionary key overflow");
}

/// Infallibly append a value to this builder repeatedly `count` times.
/// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups.
///
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
pub fn append_values(&mut self, value: impl AsRef<T::Native>, count: usize) {
self.append_n(value, count)
.expect("dictionary key overflow");
}

/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
Expand All @@ -256,6 +286,19 @@ where
};
}

/// Append an `Option` value into the builder repeatedly `count` times.
/// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups.
///
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
pub fn append_options(&mut self, value: Option<impl AsRef<T::Native>>, count: usize) {
match value {
None => self.keys_builder.append_nulls(count),
Some(v) => self.append_values(v, count),
};
}

/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.dedup.clear();
Expand Down Expand Up @@ -331,8 +374,7 @@ fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[
/// // The builder builds the dictionary value by value
/// builder.append("abc").unwrap();
/// builder.append_null();
/// builder.append("def").unwrap();
/// builder.append("def").unwrap();
/// builder.append_n("def", 2).unwrap(); // appends "def" twice with a single lookup
/// builder.append("abc").unwrap();
/// let array = builder.finish();
///
Expand Down
7 changes: 7 additions & 0 deletions arrow-array/src/builder/primitive_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,13 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
self.values_builder.append(v);
}

/// Appends a value of type `T` into the builder `n` times
#[inline]
pub fn append_value_repeated(&mut self, v: T::Native, n: usize) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is kind of sad that the naming of this can't be consistent, but I don't really have any better suggestions. Perhaps append_value_n??

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I noticed the names are getting a bit messy and mixed up. I'm happy to go with that suggestion.

self.null_buffer_builder.append_n_non_nulls(n);
self.values_builder.append_n(n, v);
}

/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
Expand Down
62 changes: 50 additions & 12 deletions arrow-array/src/builder/primitive_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray};
use arrow_buffer::{ArrowNativeType, ToByteSlice};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;

Expand Down Expand Up @@ -210,26 +209,41 @@ where
K: ArrowDictionaryKeyType,
V: ArrowPrimitiveType,
{
/// Append a primitive value to the array. Return an existing index
/// if already present in the values array or a new index if the
/// value is appended to the values array.
#[inline]
pub fn append(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
let key = match self.map.entry(Value(value)) {
Entry::Vacant(vacant) => {
// Append new value.
fn get_or_insert_key(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
match self.map.get(&Value(value)) {
Some(&key) => {
Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
}
None => {
let key = self.values_builder.len();
self.values_builder.append_value(value);
vacant.insert(key);
K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?
self.map.insert(Value(value), key);
Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
}
Entry::Occupied(o) => K::Native::usize_as(*o.get()),
};
}
}

/// Append a primitive value to the array. Return an existing index
/// if already present in the values array or a new index if the
/// value is appended to the values array.
#[inline]
pub fn append(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
let key = self.get_or_insert_key(value)?;
self.keys_builder.append_value(key);
Ok(key)
}

/// Append a value multiple times to the array.
/// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups.
///
/// Returns an error if the new index would overflow the key type.
pub fn append_n(&mut self, value: V::Native, count: usize) -> Result<K::Native, ArrowError> {
let key = self.get_or_insert_key(value)?;
self.keys_builder.append_value_repeated(key, count);
Ok(key)
}

/// Infallibly append a value to this builder
///
/// # Panics
Expand All @@ -240,6 +254,17 @@ where
self.append(value).expect("dictionary key overflow");
}

/// Infallibly append a value to this builder repeatedly `count` times.
/// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups.
///
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
pub fn append_values(&mut self, value: V::Native, count: usize) {
self.append_n(value, count)
.expect("dictionary key overflow");
}

/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
Expand All @@ -259,6 +284,19 @@ where
};
}

/// Append an `Option` value into the builder repeatedly `count` times.
/// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups.
///
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
pub fn append_options(&mut self, value: Option<V::Native>, count: usize) {
match value {
None => self.keys_builder.append_nulls(count),
Some(v) => self.append_values(v, count),
};
}

/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.map.clear();
Expand Down
Loading