Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Improve binview extend/ifthenelse #18164

Merged
merged 1 commit into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/binview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
let buffers = self.buffers.as_ref();

for view in self.views.as_ref() {
unsafe { mutable.push_view_copied_unchecked(*view, buffers) }
unsafe { mutable.push_view_unchecked(*view, buffers) }
}
mutable.freeze().with_validity(self.validity)
}
Expand Down
13 changes: 8 additions & 5 deletions crates/polars-arrow/src/array/binview/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ impl<T: ViewType + ?Sized> MutableBinaryViewArray<T> {
/// - caller must allocate enough capacity
/// - caller must ensure the view and buffers match.
/// - The array must not have validity.
pub(crate) unsafe fn push_view_copied_unchecked(&mut self, v: View, buffers: &[Buffer<u8>]) {
pub(crate) unsafe fn push_view_unchecked(&mut self, v: View, buffers: &[Buffer<u8>]) {
let len = v.length;
self.total_bytes_len += len as usize;
if len <= 12 {
Expand All @@ -165,7 +165,7 @@ impl<T: ViewType + ?Sized> MutableBinaryViewArray<T> {
/// - caller must ensure the view and buffers match.
/// - The array must not have validity.
/// - caller must not mix use this function with other push functions.
pub unsafe fn push_view_unchecked(&mut self, mut v: View, buffers: &[Buffer<u8>]) {
pub unsafe fn push_view_unchecked_dedupe(&mut self, mut v: View, buffers: &[Buffer<u8>]) {
let len = v.length;
self.total_bytes_len += len as usize;
if len <= 12 {
Expand Down Expand Up @@ -438,14 +438,17 @@ impl<T: ViewType + ?Sized> MutableBinaryViewArray<T> {
/// # Safety
/// Same as `push_view_unchecked()`.
#[inline]
pub unsafe fn extend_non_null_views_trusted_len_unchecked<I>(
pub unsafe fn extend_non_null_views_unchecked_dedupe<I>(
&mut self,
iterator: I,
buffers: &[Buffer<u8>],
) where
I: TrustedLen<Item = View>,
I: Iterator<Item = View>,
{
self.extend_non_null_views_unchecked(iterator, buffers);
self.reserve(iterator.size_hint().0);
for v in iterator {
self.push_view_unchecked_dedupe(v, buffers);
}
}

#[inline]
Expand Down
17 changes: 16 additions & 1 deletion crates/polars-arrow/src/array/growable/binview.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::ops::Deref;
use std::sync::Arc;

use polars_utils::aliases::{InitHashMaps, PlHashSet};

use super::Growable;
use crate::array::binview::{BinaryViewArrayGeneric, ViewType};
use crate::array::growable::utils::{extend_validity, extend_validity_copies, prepare_validity};
Expand All @@ -18,6 +20,7 @@ pub struct GrowableBinaryViewArray<'a, T: ViewType + ?Sized> {
inner: MutableBinaryViewArray<T>,
same_buffers: Option<&'a Arc<[Buffer<u8>]>>,
total_same_buffers_len: usize, // Only valid if same_buffers is Some.
has_duplicate_buffers: bool,
}

impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> {
Expand Down Expand Up @@ -51,13 +54,22 @@ impl<'a, T: ViewType + ?Sized> GrowableBinaryViewArray<'a, T> {
.then(|| arrays[0].total_buffer_len())
.unwrap_or_default();

let mut duplicates = PlHashSet::new();
let mut has_duplicate_buffers = false;
for arr in arrays.iter() {
if !duplicates.insert(arr.data_buffers().as_ptr()) {
has_duplicate_buffers = true;
break;
}
}
Self {
arrays,
data_type,
validity: prepare_validity(use_validity, capacity),
inner: MutableBinaryViewArray::<T>::with_capacity(capacity),
same_buffers,
total_same_buffers_len,
has_duplicate_buffers,
}
}

Expand Down Expand Up @@ -97,9 +109,12 @@ impl<'a, T: ViewType + ?Sized> Growable<'a> for GrowableBinaryViewArray<'a, T> {
.views
.extend(views_iter.inspect(|v| total_len += v.length as usize));
self.inner.total_bytes_len += total_len;
} else if self.has_duplicate_buffers {
self.inner
.extend_non_null_views_unchecked_dedupe(views_iter, local_buffers.deref());
} else {
self.inner
.extend_non_null_views_trusted_len_unchecked(views_iter, local_buffers.deref());
.extend_non_null_views_unchecked(views_iter, local_buffers.deref());
}
}

Expand Down
68 changes: 52 additions & 16 deletions crates/polars-compute/src/if_then_else/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use arrow::array::{Array, BinaryViewArray, MutablePlBinary, Utf8ViewArray, View}
use arrow::bitmap::Bitmap;
use arrow::buffer::Buffer;
use arrow::datatypes::ArrowDataType;
use polars_utils::aliases::{InitHashMaps, PlHashSet};

use super::IfThenElseKernel;
use crate::if_then_else::scalar::if_then_else_broadcast_both_scalar_64;
Expand All @@ -28,12 +29,25 @@ fn make_buffer_and_views<const N: usize>(
(views, buf)
}

fn has_duplicate_buffers(bufs: &[Buffer<u8>]) -> bool {
let mut has_duplicate_buffers = false;
let mut bufset = PlHashSet::new();
for buf in bufs {
if !bufset.insert(buf.as_ptr()) {
has_duplicate_buffers = true;
break;
}
}
has_duplicate_buffers
}

impl IfThenElseKernel for BinaryViewArray {
type Scalar<'a> = &'a [u8];

fn if_then_else(mask: &Bitmap, if_true: &Self, if_false: &Self) -> Self {
let combined_buffers: Arc<_>;
let false_buffer_idx_offset: u32;
let mut has_duplicate_bufs = false;
if Arc::ptr_eq(if_true.data_buffers(), if_false.data_buffers()) {
// Share exact same buffers, no need to combine.
combined_buffers = if_true.data_buffers().clone();
Expand All @@ -42,7 +56,9 @@ impl IfThenElseKernel for BinaryViewArray {
// Put false buffers after true buffers.
let true_buffers = if_true.data_buffers().iter().cloned();
let false_buffers = if_false.data_buffers().iter().cloned();

combined_buffers = true_buffers.chain(false_buffers).collect();
has_duplicate_bufs = has_duplicate_buffers(&combined_buffers);
false_buffer_idx_offset = if_true.data_buffers().len() as u32;
}

Expand All @@ -57,12 +73,19 @@ impl IfThenElseKernel for BinaryViewArray {
let validity = super::if_then_else_validity(mask, if_true.validity(), if_false.validity());

let mut builder = MutablePlBinary::with_capacity(views.len());
unsafe {
builder.extend_non_null_views_trusted_len_unchecked(
views.into_iter(),
combined_buffers.deref(),
)
};

if has_duplicate_bufs {
unsafe {
builder.extend_non_null_views_unchecked_dedupe(
views.into_iter(),
combined_buffers.deref(),
)
};
} else {
unsafe {
builder.extend_non_null_views_unchecked(views.into_iter(), combined_buffers.deref())
};
}
builder
.freeze_with_dtype(if_true.data_type().clone())
.with_validity(validity)
Expand Down Expand Up @@ -90,12 +113,17 @@ impl IfThenElseKernel for BinaryViewArray {
let validity = super::if_then_else_validity(mask, None, if_false.validity());

let mut builder = MutablePlBinary::with_capacity(views.len());

unsafe {
builder.extend_non_null_views_trusted_len_unchecked(
views.into_iter(),
combined_buffers.deref(),
)
};
if has_duplicate_buffers(&combined_buffers) {
builder.extend_non_null_views_unchecked_dedupe(
views.into_iter(),
combined_buffers.deref(),
)
} else {
builder.extend_non_null_views_unchecked(views.into_iter(), combined_buffers.deref())
}
}
builder
.freeze_with_dtype(if_false.data_type().clone())
.with_validity(validity)
Expand Down Expand Up @@ -125,10 +153,14 @@ impl IfThenElseKernel for BinaryViewArray {

let mut builder = MutablePlBinary::with_capacity(views.len());
unsafe {
builder.extend_non_null_views_trusted_len_unchecked(
views.into_iter(),
combined_buffers.deref(),
)
if has_duplicate_buffers(&combined_buffers) {
builder.extend_non_null_views_unchecked_dedupe(
views.into_iter(),
combined_buffers.deref(),
)
} else {
builder.extend_non_null_views_unchecked(views.into_iter(), combined_buffers.deref())
}
};
builder
.freeze_with_dtype(if_true.data_type().clone())
Expand All @@ -152,7 +184,11 @@ impl IfThenElseKernel for BinaryViewArray {

let mut builder = MutablePlBinary::with_capacity(views.len());
unsafe {
builder.extend_non_null_views_trusted_len_unchecked(views.into_iter(), buffers.deref())
if has_duplicate_buffers(&buffers) {
builder.extend_non_null_views_unchecked_dedupe(views.into_iter(), buffers.deref())
} else {
builder.extend_non_null_views_unchecked(views.into_iter(), buffers.deref())
}
};
builder.freeze_with_dtype(dtype)
}
Expand Down