Skip to content

Commit

Permalink
feat: Add support for binary size method to Expr and Series "bin" n…
Browse files Browse the repository at this point in the history
…amespace
  • Loading branch information
alexander-beedie committed Jul 29, 2024
1 parent 9c29683 commit 1706c12
Show file tree
Hide file tree
Showing 12 changed files with 136 additions and 13 deletions.
10 changes: 10 additions & 0 deletions crates/polars-arrow/src/legacy/kernels/binary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
use crate::array::{Array, ArrayRef, BinaryViewArray, UInt32Array};
use crate::buffer::Buffer;
use crate::datatypes::ArrowDataType;

pub fn binary_size_bytes(array: &BinaryViewArray) -> ArrayRef {
let values = array.len_iter().collect::<Vec<_>>();
let values: Buffer<_> = values.into();
let array = UInt32Array::new(ArrowDataType::UInt32, values, array.validity().cloned());
Box::new(array)
}
1 change: 1 addition & 0 deletions crates/polars-arrow/src/legacy/kernels/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::iter::Enumerate;
use crate::array::BooleanArray;
use crate::bitmap::utils::BitChunks;
pub mod atan2;
pub mod binary;
pub mod concatenate;
pub mod ewm;
#[cfg(feature = "compute_take")]
Expand Down
7 changes: 7 additions & 0 deletions crates/polars-ops/src/chunked_array/binary/namespace.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#[cfg(feature = "binary_encoding")]
use std::borrow::Cow;

use arrow::legacy::kernels::binary::*;
#[cfg(feature = "binary_encoding")]
use base64::engine::general_purpose;
#[cfg(feature = "binary_encoding")]
Expand Down Expand Up @@ -69,6 +70,12 @@ pub trait BinaryNameSpaceImpl: AsBinary {
}
}

/// Compute the size, in bytes, of each binary value.
///
/// Applies the `binary_size_bytes` kernel to the underlying binary data and
/// returns the per-value sizes as a `UInt32Chunked`.
fn size_bytes(&self) -> UInt32Chunked {
    self.as_binary().apply_kernel_cast(&binary_size_bytes)
}

#[cfg(feature = "binary_encoding")]
fn hex_decode(&self, strict: bool) -> PolarsResult<BinaryChunked> {
let ca = self.as_binary();
Expand Down
6 changes: 6 additions & 0 deletions crates/polars-plan/src/dsl/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ impl BinaryNameSpace {
)
}

/// Build an expression yielding the size (number of bytes) of each element.
pub fn size_bytes(self) -> Expr {
    let function = FunctionExpr::BinaryExpr(BinaryFunction::Size);
    self.0.map_private(function)
}

#[cfg(feature = "binary_encoding")]
pub fn hex_decode(self, strict: bool) -> Expr {
self.0
Expand Down
9 changes: 9 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pub enum BinaryFunction {
Base64Decode(bool),
#[cfg(feature = "binary_encoding")]
Base64Encode,
Size,
}

impl BinaryFunction {
Expand All @@ -32,6 +33,7 @@ impl BinaryFunction {
HexDecode(_) | Base64Decode(_) => mapper.with_same_dtype(),
#[cfg(feature = "binary_encoding")]
HexEncode | Base64Encode => mapper.with_dtype(DataType::String),
Size => mapper.with_dtype(DataType::UInt32),
}
}
}
Expand All @@ -51,6 +53,7 @@ impl Display for BinaryFunction {
Base64Decode(_) => "base64_decode",
#[cfg(feature = "binary_encoding")]
Base64Encode => "base64_encode",
Size => "size_bytes",
};
write!(f, "bin.{s}")
}
Expand All @@ -77,6 +80,7 @@ impl From<BinaryFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
Base64Decode(strict) => map!(base64_decode, strict),
#[cfg(feature = "binary_encoding")]
Base64Encode => map!(base64_encode),
Size => map!(size_bytes),
}
}
}
Expand Down Expand Up @@ -107,6 +111,11 @@ pub(super) fn starts_with(s: &[Series]) -> PolarsResult<Series> {
.into_series())
}

/// Series-level entry point for `bin.size_bytes`.
///
/// Propagates the error from `Series::binary` if `s` cannot be viewed as binary;
/// otherwise returns the per-value byte sizes as a new `Series`.
pub(super) fn size_bytes(s: &Series) -> PolarsResult<Series> {
    s.binary().map(|ca| ca.size_bytes().into_series())
}

#[cfg(feature = "binary_encoding")]
pub(super) fn hex_decode(s: &Series, strict: bool) -> PolarsResult<Series> {
let ca = s.binary()?;
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ The following methods are available under the `expr.bin` attribute.
Expr.bin.decode
Expr.bin.encode
Expr.bin.ends_with
Expr.bin.size
Expr.bin.starts_with
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ The following methods are available under the `Series.bin` attribute.
Series.bin.decode
Series.bin.encode
Series.bin.ends_with
Series.bin.size
Series.bin.starts_with
13 changes: 11 additions & 2 deletions py-polars/polars/_utils/various.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Literal,
Sequence,
TypeVar,
overload,
)

import polars as pl
Expand All @@ -39,7 +40,7 @@
if TYPE_CHECKING:
from collections.abc import Iterator, Reversible

from polars import DataFrame
from polars import DataFrame, Expr
from polars._typing import PolarsDataType, SizeUnit

if sys.version_info >= (3, 13):
Expand Down Expand Up @@ -221,7 +222,15 @@ def ordered_unique(values: Sequence[Any]) -> list[Any]:
return [v for v in values if not (v in seen or add_(v))]


def scale_bytes(sz: int, unit: SizeUnit) -> int | float:
@overload
def scale_bytes(sz: int, unit: SizeUnit) -> int | float: ...


@overload
def scale_bytes(sz: Expr, unit: SizeUnit) -> Expr: ...


def scale_bytes(sz: int | Expr, unit: SizeUnit) -> int | float | Expr:
"""Scale size in bytes to other size units (eg: "kb", "mb", "gb", "tb")."""
if unit in {"b", "bytes"}:
return sz
Expand Down
36 changes: 35 additions & 1 deletion py-polars/polars/expr/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from typing import TYPE_CHECKING

from polars._utils.parse import parse_into_expression
from polars._utils.various import scale_bytes
from polars._utils.wrap import wrap_expr

if TYPE_CHECKING:
from polars import Expr
from polars._typing import IntoExpr, TransferEncoding
from polars._typing import IntoExpr, SizeUnit, TransferEncoding


class ExprBinaryNameSpace:
Expand Down Expand Up @@ -251,3 +252,36 @@ def encode(self, encoding: TransferEncoding) -> Expr:
else:
msg = f"`encoding` must be one of {{'hex', 'base64'}}, got {encoding!r}"
raise ValueError(msg)

def size(self, unit: SizeUnit = "b") -> Expr:
    r"""
    Get the size of binary values in the given unit.

    Parameters
    ----------
    unit : {'b', 'kb', 'mb', 'gb', 'tb'}
        Scale the returned size to the given unit.

    Returns
    -------
    Expr
        Expression of data type :class:`UInt32` when the size is returned in
        bytes, or :class:`Float64` when it is scaled to a larger unit.

    Examples
    --------
    >>> from os import urandom
    >>> df = pl.DataFrame({"data": [urandom(n) for n in (512, 256, 2560, 1024)]})
    >>> df.with_columns(  # doctest: +IGNORE_RESULT
    ...     n_bytes=pl.col("data").bin.size(),
    ...     n_kilobytes=pl.col("data").bin.size("kb"),
    ... )
    shape: (4, 3)
    ┌─────────────────────────────────┬─────────┬─────────────┐
    │ data                            ┆ n_bytes ┆ n_kilobytes │
    │ ---                             ┆ ---     ┆ ---         │
    │ binary                          ┆ u32     ┆ f64         │
    ╞═════════════════════════════════╪═════════╪═════════════╡
    │ b"y?~B\x83\xf4V\x07\xd3\xfb\xb… ┆ 512     ┆ 0.5         │
    │ b"\xee$4@f\xc14\x07\x8e\x88\x1… ┆ 256     ┆ 0.25        │
    │ b"~\x17\x9c\xb1\xf4\xdb?\xe9\x… ┆ 2560    ┆ 2.5         │
    │ b"\x80\xbd\xb9nEq;2\x99$\xf9\x… ┆ 1024    ┆ 1.0         │
    └─────────────────────────────────┴─────────┴─────────────┘
    """
    # The native kernel always reports raw byte counts; unit scaling is
    # applied on top as an ordinary expression.
    n_bytes = wrap_expr(self._pyexpr.bin_size_bytes())
    return scale_bytes(n_bytes, unit)
26 changes: 25 additions & 1 deletion py-polars/polars/series/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

if TYPE_CHECKING:
from polars import Series
from polars._typing import IntoExpr, TransferEncoding
from polars._typing import IntoExpr, SizeUnit, TransferEncoding
from polars.polars import PySeries


Expand Down Expand Up @@ -185,3 +185,27 @@ def encode(self, encoding: TransferEncoding) -> Series:
"AAD/"
]
"""

def size(self, unit: SizeUnit = "b") -> Series:
    r"""
    Get the size of the binary values in a Series in the given unit.

    Parameters
    ----------
    unit : {'b', 'kb', 'mb', 'gb', 'tb'}
        Scale the returned size to the given unit.

    Returns
    -------
    Series
        Series of data type :class:`UInt32` when the size is returned in
        bytes, or :class:`Float64` when it is scaled to a larger unit.

    Examples
    --------
    >>> from os import urandom
    >>> s = pl.Series("data", [urandom(n) for n in (512, 256, 2560, 1024)])
    >>> s.bin.size("kb")
    shape: (4,)
    Series: 'data' [f64]
    [
            0.5
            0.25
            2.5
            1.0
    ]
    """
4 changes: 4 additions & 0 deletions py-polars/src/expr/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,8 @@ impl PyExpr {
/// Python binding: base64-encode the binary values of this expression.
fn bin_base64_encode(&self) -> Self {
    let expr = self.inner.clone();
    expr.binary().base64_encode().into()
}

/// Python binding: size in bytes of each binary value of this expression.
fn bin_size_bytes(&self) -> Self {
    let expr = self.inner.clone();
    expr.binary().size_bytes().into()
}
}
35 changes: 26 additions & 9 deletions py-polars/tests/unit/operations/namespaces/test_binary.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

import polars as pl
from polars._typing import TransferEncoding
from polars.testing import assert_frame_equal

if TYPE_CHECKING:
from polars._typing import SizeUnit, TransferEncoding


def test_binary_conversions() -> None:
df = pl.DataFrame({"blob": [b"abc", None, b"cde"]}).with_columns(
Expand Down Expand Up @@ -119,31 +125,42 @@ def test_hex_decode() -> None:

@pytest.mark.parametrize(
"encoding",
[
"hex",
"base64",
],
["hex", "base64"],
)
def test_compare_encode_between_lazy_and_eager_6814(encoding: TransferEncoding) -> None:
    """Encoding must give the same frame in eager and lazy execution."""
    frame = pl.DataFrame({"x": [b"aa", b"bb", b"cc"]})
    encode_expr = pl.col("x").bin.encode(encoding)

    eager = frame.select(encode_expr)
    # Select by the eager output dtype so the lazy plan must agree on it.
    eager_dtype = eager["x"].dtype

    lazy = frame.lazy().select(encode_expr).select(pl.col(eager_dtype)).collect()
    assert_frame_equal(eager, lazy)


@pytest.mark.parametrize(
"encoding",
[
"hex",
"base64",
],
["hex", "base64"],
)
def test_compare_decode_between_lazy_and_eager_6814(encoding: TransferEncoding) -> None:
    """Decoding must give the same frame in eager and lazy execution."""
    frame = pl.DataFrame({"x": [b"d3d3", b"abcd", b"1234"]})
    decode_expr = pl.col("x").bin.decode(encoding)

    eager = frame.select(decode_expr)
    # Select by the eager output dtype so the lazy plan must agree on it.
    eager_dtype = eager["x"].dtype

    lazy = frame.lazy().select(decode_expr).select(pl.col(eager_dtype)).collect()
    assert_frame_equal(eager, lazy)


@pytest.mark.parametrize(
    ("sz", "unit", "expected"),
    [(128, "b", 128), (512, "kb", 0.5), (131072, "mb", 0.125)],
)
def test_binary_size(sz: int, unit: SizeUnit, expected: int | float) -> None:
    """`bin.size` must report the same scaled size via the Expr and Series APIs."""
    df = pl.DataFrame({"data": [b"\x00" * sz]}, schema={"data": pl.Binary})
    results = (
        df.select(sz=pl.col("data").bin.size(unit)).item(),  # expr
        df["data"].bin.size(unit).item(),  # series
    )
    # Use a distinct loop name so the parametrized `sz` argument is not rebound.
    for result in results:
        assert result == expected

0 comments on commit 1706c12

Please sign in to comment.