Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add support for binary size method to Expr and Series "bin" namespace #17924

Merged
merged 4 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/polars-compute/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
use arrow::types::NativeType;

pub mod arithmetic;
pub mod arity;
pub mod comparisons;
pub mod filter;
pub mod float_sum;
pub mod if_then_else;
pub mod min_max;
pub mod size;
pub mod unique;

pub mod arity;

// Marker trait (it declares no items of its own) used to enable the scalar
// blanket implementation. Implementors must also be `NativeType`.
pub trait NotSimdPrimitive: NativeType {}

Expand Down
9 changes: 9 additions & 0 deletions crates/polars-compute/src/size.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
use arrow::array::{Array, ArrayRef, BinaryViewArray, UInt32Array};
use arrow::buffer::Buffer;
use arrow::datatypes::ArrowDataType;

/// Build a `UInt32` array holding the length of every value in `array`.
///
/// The lengths come from `BinaryViewArray::len_iter`, and the validity of the
/// input (if any) is cloned onto the output array.
pub fn binary_size_bytes(array: &BinaryViewArray) -> ArrayRef {
    let lengths = array.len_iter().collect::<Buffer<_>>();
    let validity = array.validity().cloned();
    Box::new(UInt32Array::new(ArrowDataType::UInt32, lengths, validity))
}
7 changes: 7 additions & 0 deletions crates/polars-ops/src/chunked_array/binary/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use base64::engine::general_purpose;
#[cfg(feature = "binary_encoding")]
use base64::Engine as _;
use memchr::memmem::find;
use polars_compute::size::binary_size_bytes;
use polars_core::prelude::arity::{broadcast_binary_elementwise_values, unary_elementwise_values};

use super::*;
Expand Down Expand Up @@ -69,6 +70,12 @@ pub trait BinaryNameSpaceImpl: AsBinary {
}
}

/// Compute the size, in bytes, of each binary value.
///
/// Delegates to the `binary_size_bytes` kernel and yields a `UInt32Chunked`.
fn size_bytes(&self) -> UInt32Chunked {
    self.as_binary().apply_kernel_cast(&binary_size_bytes)
}

#[cfg(feature = "binary_encoding")]
fn hex_decode(&self, strict: bool) -> PolarsResult<BinaryChunked> {
let ca = self.as_binary();
Expand Down
6 changes: 6 additions & 0 deletions crates/polars-plan/src/dsl/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ impl BinaryNameSpace {
)
}

/// Build an expression yielding the size (number of bytes) of each element.
pub fn size_bytes(self) -> Expr {
    let function = FunctionExpr::BinaryExpr(BinaryFunction::Size);
    self.0.map_private(function)
}

#[cfg(feature = "binary_encoding")]
pub fn hex_decode(self, strict: bool) -> Expr {
self.0
Expand Down
13 changes: 10 additions & 3 deletions crates/polars-plan/src/dsl/function_expr/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
use serde::{Deserialize, Serialize};

use super::*;
#[cfg(feature = "binary_encoding")]
use crate::map;
use crate::map_as_slice;
use crate::{map, map_as_slice};

#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
Expand All @@ -20,6 +18,7 @@ pub enum BinaryFunction {
Base64Decode(bool),
#[cfg(feature = "binary_encoding")]
Base64Encode,
Size,
}

impl BinaryFunction {
Expand All @@ -32,6 +31,7 @@ impl BinaryFunction {
HexDecode(_) | Base64Decode(_) => mapper.with_same_dtype(),
#[cfg(feature = "binary_encoding")]
HexEncode | Base64Encode => mapper.with_dtype(DataType::String),
Size => mapper.with_dtype(DataType::UInt32),
}
}
}
Expand All @@ -51,6 +51,7 @@ impl Display for BinaryFunction {
Base64Decode(_) => "base64_decode",
#[cfg(feature = "binary_encoding")]
Base64Encode => "base64_encode",
Size => "size_bytes",
};
write!(f, "bin.{s}")
}
Expand All @@ -77,6 +78,7 @@ impl From<BinaryFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
Base64Decode(strict) => map!(base64_decode, strict),
#[cfg(feature = "binary_encoding")]
Base64Encode => map!(base64_encode),
Size => map!(size_bytes),
}
}
}
Expand Down Expand Up @@ -107,6 +109,11 @@ pub(super) fn starts_with(s: &[Series]) -> PolarsResult<Series> {
.into_series())
}

/// Series-level entry point for `bin.size_bytes`.
///
/// Fails with the error from `Series::binary` when `s` cannot be viewed as a
/// binary column; otherwise returns the per-value byte sizes as a `Series`.
pub(super) fn size_bytes(s: &Series) -> PolarsResult<Series> {
    s.binary().map(|ca| ca.size_bytes().into_series())
}

#[cfg(feature = "binary_encoding")]
pub(super) fn hex_decode(s: &Series, strict: bool) -> PolarsResult<Series> {
let ca = s.binary()?;
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ The following methods are available under the `expr.bin` attribute.
Expr.bin.decode
Expr.bin.encode
Expr.bin.ends_with
Expr.bin.size
Expr.bin.starts_with
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ The following methods are available under the `Series.bin` attribute.
Series.bin.decode
Series.bin.encode
Series.bin.ends_with
Series.bin.size
Series.bin.starts_with
13 changes: 11 additions & 2 deletions py-polars/polars/_utils/various.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Literal,
Sequence,
TypeVar,
overload,
)

import polars as pl
Expand All @@ -39,7 +40,7 @@
if TYPE_CHECKING:
from collections.abc import Iterator, Reversible

from polars import DataFrame
from polars import DataFrame, Expr
from polars._typing import PolarsDataType, SizeUnit

if sys.version_info >= (3, 13):
Expand Down Expand Up @@ -221,7 +222,15 @@ def ordered_unique(values: Sequence[Any]) -> list[Any]:
return [v for v in values if not (v in seen or add_(v))]


def scale_bytes(sz: int, unit: SizeUnit) -> int | float:
@overload
def scale_bytes(sz: int, unit: SizeUnit) -> int | float: ...


@overload
def scale_bytes(sz: Expr, unit: SizeUnit) -> Expr: ...


def scale_bytes(sz: int | Expr, unit: SizeUnit) -> int | float | Expr:
"""Scale size in bytes to other size units (eg: "kb", "mb", "gb", "tb")."""
if unit in {"b", "bytes"}:
return sz
Expand Down
36 changes: 35 additions & 1 deletion py-polars/polars/expr/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from typing import TYPE_CHECKING

from polars._utils.parse import parse_into_expression
from polars._utils.various import scale_bytes
from polars._utils.wrap import wrap_expr

if TYPE_CHECKING:
from polars import Expr
from polars._typing import IntoExpr, TransferEncoding
from polars._typing import IntoExpr, SizeUnit, TransferEncoding


class ExprBinaryNameSpace:
Expand Down Expand Up @@ -251,3 +252,36 @@ def encode(self, encoding: TransferEncoding) -> Expr:
else:
msg = f"`encoding` must be one of {{'hex', 'base64'}}, got {encoding!r}"
raise ValueError(msg)

def size(self, unit: SizeUnit = "b") -> Expr:
    r"""
    Get the size of binary values in the given unit.

    Parameters
    ----------
    unit
        Unit the size is reported in, e.g. `"b"` (bytes, the default),
        `"kb"`, `"mb"`, `"gb"` or `"tb"`.

    Returns
    -------
    Expr
        Expression of data type :class:`UInt32` when the size is reported in
        bytes; scaled units such as `"kb"` give :class:`Float64` (see the
        example below).

    Examples
    --------
    >>> from os import urandom
    >>> df = pl.DataFrame({"data": [urandom(n) for n in (512, 256, 2560, 1024)]})
    >>> df.with_columns(  # doctest: +IGNORE_RESULT
    ...     n_bytes=pl.col("data").bin.size(),
    ...     n_kilobytes=pl.col("data").bin.size("kb"),
    ... )
    shape: (4, 3)
    ┌─────────────────────────────────┬─────────┬─────────────┐
    │ data                            ┆ n_bytes ┆ n_kilobytes │
    │ ---                             ┆ ---     ┆ ---         │
    │ binary                          ┆ u32     ┆ f64         │
    ╞═════════════════════════════════╪═════════╪═════════════╡
    │ b"y?~B\x83\xf4V\x07\xd3\xfb\xb… ┆ 512     ┆ 0.5         │
    │ b"\xee$4@f\xc14\x07\x8e\x88\x1… ┆ 256     ┆ 0.25        │
    │ b"~\x17\x9c\xb1\xf4\xdb?\xe9\x… ┆ 2560    ┆ 2.5         │
    │ b"\x80\xbd\xb9nEq;2\x99$\xf9\x… ┆ 1024    ┆ 1.0         │
    └─────────────────────────────────┴─────────┴─────────────┘
    """
    # The native expression always yields the size in bytes; any unit
    # conversion is applied afterwards by `scale_bytes`.
    size_in_bytes = wrap_expr(self._pyexpr.bin_size_bytes())
    return scale_bytes(size_in_bytes, unit)
26 changes: 25 additions & 1 deletion py-polars/polars/series/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

if TYPE_CHECKING:
from polars import Series
from polars._typing import IntoExpr, TransferEncoding
from polars._typing import IntoExpr, SizeUnit, TransferEncoding
from polars.polars import PySeries


Expand Down Expand Up @@ -185,3 +185,27 @@ def encode(self, encoding: TransferEncoding) -> Series:
"AAD/"
]
"""

def size(self, unit: SizeUnit = "b") -> Series:
    r"""
    Get the size of the binary values in a Series in the given unit.

    Parameters
    ----------
    unit
        Unit the size is reported in, e.g. `"b"` (bytes, the default) or
        `"kb"`.

    Returns
    -------
    Series
        Series of data type :class:`UInt32` when the size is reported in
        bytes; scaled units such as `"kb"` give :class:`Float64` (see the
        example below).

    Examples
    --------
    >>> from os import urandom
    >>> s = pl.Series("data", [urandom(n) for n in (512, 256, 2560, 1024)])
    >>> s.bin.size("kb")
    shape: (4,)
    Series: 'data' [f64]
    [
        0.5
        0.25
        2.5
        1.0
    ]
    """
4 changes: 4 additions & 0 deletions py-polars/src/expr/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,8 @@ impl PyExpr {
fn bin_base64_encode(&self) -> Self {
self.inner.clone().binary().base64_encode().into()
}

/// Python binding for `bin.size_bytes`: clones the wrapped expression and
/// applies the binary-namespace `size_bytes` to it.
fn bin_size_bytes(&self) -> Self {
    let expr = self.inner.clone();
    expr.binary().size_bytes().into()
}
}
35 changes: 26 additions & 9 deletions py-polars/tests/unit/operations/namespaces/test_binary.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

import polars as pl
from polars._typing import TransferEncoding
from polars.testing import assert_frame_equal

if TYPE_CHECKING:
from polars._typing import SizeUnit, TransferEncoding


def test_binary_conversions() -> None:
df = pl.DataFrame({"blob": [b"abc", None, b"cde"]}).with_columns(
Expand Down Expand Up @@ -119,31 +125,42 @@ def test_hex_decode() -> None:

@pytest.mark.parametrize(
"encoding",
[
"hex",
"base64",
],
["hex", "base64"],
)
def test_compare_encode_between_lazy_and_eager_6814(encoding: TransferEncoding) -> None:
    """Encoding must give the same frame whether evaluated eagerly or lazily."""
    frame = pl.DataFrame({"x": [b"aa", b"bb", b"cc"]})
    encode_expr = pl.col("x").bin.encode(encoding)

    eager = frame.select(encode_expr)
    # Re-select by the eager dtype so the lazy plan has to resolve the same
    # output type as the eager result.
    eager_dtype = eager["x"].dtype
    lazy = frame.lazy().select(encode_expr).select(pl.col(eager_dtype)).collect()

    assert_frame_equal(eager, lazy)


@pytest.mark.parametrize(
"encoding",
[
"hex",
"base64",
],
["hex", "base64"],
)
def test_compare_decode_between_lazy_and_eager_6814(encoding: TransferEncoding) -> None:
    """Decoding must give the same frame whether evaluated eagerly or lazily."""
    frame = pl.DataFrame({"x": [b"d3d3", b"abcd", b"1234"]})
    decode_expr = pl.col("x").bin.decode(encoding)

    eager = frame.select(decode_expr)
    # Re-select by the eager dtype so the lazy plan has to resolve the same
    # output type as the eager result.
    eager_dtype = eager["x"].dtype
    lazy = frame.lazy().select(decode_expr).select(pl.col(eager_dtype)).collect()

    assert_frame_equal(eager, lazy)


@pytest.mark.parametrize(
    ("sz", "unit", "expected"),
    [(128, "b", 128), (512, "kb", 0.5), (131072, "mb", 0.125)],
)
def test_binary_size(sz: int, unit: SizeUnit, expected: int | float) -> None:
    """`bin.size` reports the same scaled size via the Expr and Series APIs."""
    df = pl.DataFrame({"data": [b"\x00" * sz]}, schema={"data": pl.Binary})
    # Use a distinct loop name: reusing `sz` would shadow the parametrized
    # input size with the computed results.
    for result in (
        df.select(sz=pl.col("data").bin.size(unit)).item(),  # expr
        df["data"].bin.size(unit).item(),  # series
    ):
        assert result == expected
Loading