From 4512d261a14c846b16d0736c11d3f33cdd0e9542 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Fri, 29 Sep 2023 11:10:21 +0300 Subject: [PATCH 01/14] begining --- src/rust/src/rlib.rs | 18 ++++++++++++++++++ src/rust/src/utils/mod.rs | 25 +++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/rust/src/rlib.rs b/src/rust/src/rlib.rs index e5ae8d5ca..8bff98dd3 100644 --- a/src/rust/src/rlib.rs +++ b/src/rust/src/rlib.rs @@ -3,10 +3,12 @@ use crate::lazy::dsl::ProtoExprArray; use crate::rdataframe::DataFrame; use crate::robj_to; +use crate::rdataframe::LazyFrame; use crate::rpolarserr::{rdbg, RResult}; use crate::series::Series; use crate::{rdataframe::VecDataFrame, utils::r_result_list}; use extendr_api::prelude::*; +use polars::lazy::dsl; use polars::prelude as pl; use polars_core::functions as pl_functions; use std::result::Result; @@ -44,6 +46,21 @@ fn concat_df(vdf: &VecDataFrame) -> List { r_result_list(result.map_err(|err| format!("{:?}", err))) } +#[extendr] +fn concat_lf(l: Robj, rechunk: bool, parallel: bool, to_supertypes: bool) -> RResult { + let vlf = robj_to!(Vec, PLLazyFrame, l)?; + dsl::concat( + vlf, + pl::UnionArgs { + parallel, + rechunk, + to_supertypes, + }, + ) + .map_err(polars_to_rpolars_err) + .map(LazyFrame) +} + #[extendr] fn diag_concat_df(dfs: &VecDataFrame) -> List { let df = pl_functions::diag_concat_df(&dfs.0[..]).map(DataFrame); @@ -278,6 +295,7 @@ fn polars_features() -> List { extendr_module! { mod rlib; fn concat_df; + fn concat_lf; fn hor_concat_df; fn diag_concat_df; fn min_exprs; diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index 5b3a0e87b..e9186ce0f 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -6,10 +6,12 @@ pub mod wrappers; use crate::lazy::dsl::Expr; use crate::rdatatype::RPolarsDataType; use crate::rpolarserr::{rdbg, rerr, RPolarsErr, RResult, WithRctx}; + use extendr_api::prelude::list; use std::any::type_name as tn; //use std::intrinsics::read_via_copy; use crate::lazy::dsl::robj_to_col; +use crate::rdataframe::{DataFrame, LazyFrame}; use extendr_api::Attributes; use extendr_api::ExternalPtr; use extendr_api::Result as ExtendrResult; @@ -742,8 +744,23 @@ fn internal_rust_wrap_e(robj: Robj, str_to_lit: bool) -> RResult { pub fn robj_to_lazyframe(robj: extendr_api::Robj) -> RResult { let robj = unpack_r_result_list(robj)?; let rv = rdbg(&robj); - use crate::rdataframe::LazyFrame; - let res: Result, _> = robj.try_into(); + + // closure to allow ?-convert extendr::Result to RResult + let res = || -> RResult { + match () { + // allow input as a DataFrame + _ if robj.inherits("DataFrame") => { + let extptr_df: ExternalPtr = robj.try_into()?; + Ok(extptr_df.lazy()) + } + _ => { + let lf: ExternalPtr = robj.try_into()?; + let lf = LazyFrame(lf.0.clone()); + Ok(lf) + } + } + }(); + let ext_ldf = res.bad_val(rv).mistyped(tn::())?; Ok(LazyFrame(ext_ldf.0.clone())) } @@ -895,6 +912,10 @@ macro_rules! robj_to_inner { $crate::utils::robj_to_lazyframe($a) }; + (PLLazyFrame, $a:ident) => { + $crate::utils::robj_to_lazyframe($a).map(|lf| lf.0) + }; + (RArrow_schema, $a:ident) => { $crate::utils::robj_to_rarrow_schema($a) }; From 1b2b9fb8e6fb0a46e03fad6ff472e2263eb62e9d Mon Sep 17 00:00:00 2001 From: sorhawell Date: Sun, 1 Oct 2023 01:29:52 +0300 Subject: [PATCH 02/14] almost there --- R/error__rpolarserr.R | 4 +- R/extendr-wrappers.R | 30 ++++--- R/functions__eager.R | 78 ++++++++++++------ R/lazyframe__lazy.R | 1 + R/series__series.R | 2 +- man/Series_dtype.Rd | 11 --- man/Series_flags.Rd | 21 +++++ man/pl_concat.Rd | 4 +- src/rust/src/concat.rs | 139 +++++++++++++++++++++++++++++++++ src/rust/src/lib.rs | 3 + src/rust/src/rdataframe/mod.rs | 3 +- src/rust/src/rlib.rs | 70 +---------------- src/rust/src/utils/mod.rs | 73 +++++++++++++++-- 13 files changed, 310 insertions(+), 129 deletions(-) create mode 100644 man/Series_flags.Rd create mode 100644 src/rust/src/concat.rs diff --git a/R/error__rpolarserr.R b/R/error__rpolarserr.R index 37a86b041..af8cd83be 100644 --- a/R/error__rpolarserr.R +++ b/R/error__rpolarserr.R @@ -53,8 +53,8 @@ bad_robj = function(r) { .pr$RPolarsErr$new()$bad_robj(r) } -Err_plain = function(x) { - Err(.pr$RPolarsErr$new()$plain(x)) +Err_plain = function(...) { + Err(.pr$RPolarsErr$new()$plain(paste(..., collapse=" "))) } # short hand for extracting an error context in unit testing, will raise error if not an RPolarsErr diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 91338ef9a..2cbe30350 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -11,18 +11,6 @@ #' @useDynLib polars, .registration = TRUE NULL -rlazy_csv_reader <- function(path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) .Call(wrap__rlazy_csv_reader, path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) - -import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_count, memmap) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_count, memmap) - -new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) - -concat_df <- function(vdf) .Call(wrap__concat_df, vdf) - -hor_concat_df <- function(dfs) .Call(wrap__hor_concat_df, dfs) - -diag_concat_df <- function(dfs) .Call(wrap__diag_concat_df, dfs) - min_exprs <- function(exprs) .Call(wrap__min_exprs, exprs) max_exprs <- function(exprs) .Call(wrap__max_exprs, exprs) @@ -75,6 +63,24 @@ test_wrong_call_pl_lit <- function(robj) .Call(wrap__test_wrong_call_pl_lit, rob polars_features <- function() .Call(wrap__polars_features) +concat_df <- function(vdf) .Call(wrap__concat_df, vdf) + +concat_lf <- function(l, rechunk, parallel, to_supertypes) .Call(wrap__concat_lf, l, rechunk, parallel, to_supertypes) + +diag_concat_df <- function(dfs) .Call(wrap__diag_concat_df, dfs) + +diag_concat_lf <- function(l, rechunk, parallel) .Call(wrap__diag_concat_lf, l, rechunk, parallel) + +hor_concat_df <- function(l) .Call(wrap__hor_concat_df, l) + +concat_series <- function(l, rechunk, to_supertypes) .Call(wrap__concat_series, l, rechunk, to_supertypes) + +rlazy_csv_reader <- function(path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) .Call(wrap__rlazy_csv_reader, path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) + +import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_count, memmap) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_count, memmap) + +new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) + test_rpolarserr <- function() .Call(wrap__test_rpolarserr) setup_renv <- function() .Call(wrap__setup_renv) diff --git a/R/functions__eager.R b/R/functions__eager.R index 7e98573c8..e1c5a7a30 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -3,7 +3,8 @@ #' @param l list of DataFrame, or Series, LazyFrame or Expr #' @param rechunk perform a rechunk at last #' @param how choice of bind direction "vertical"(rbind) "horizontal"(cbind) "diagonal" diagonally -#' @param parallel BOOL default TRUE, only used for LazyFrames +#' @param parallel Boolean default TRUE, only used for LazyFrames +#' @param to_supertypes Boolean default TRUE, cast columns shared super types, if any. #' #' @details #' Categorical columns/Series must have been constructed while global string cache enabled @@ -37,41 +38,68 @@ #' # diagonal #' pl$concat(l_hor, how = "diagonal") pl$concat = function( - l, # list of DataFrames or Series or lazyFrames or expr + ..., # list of DataFrames or Series or lazyFrames or expr rechunk = TRUE, - how = c("vertical", "horizontal", "diagonal"), - parallel = TRUE # not used yet + how = c("vertical", "horizontal", "diagonal"),#, "vertical_relaxed","diangonal_relaxed"), + parallel = TRUE, + eager = FALSE, + to_supertypes = FALSE ) { + browser() + l = unpack_list(...) ## Check inputs - how = match.arg(how[1L], c("vertical", "horizontal", "diagonal")) + how_args = c("vertical", "horizontal", "diagonal")#, "vertical_relaxed", "diangonal_relaxed") + + how = match.arg(how[1L], how_args) |> + result() |> + unwrap("in pl$concat()") + + first = l[[1]] + eager = isTRUE(eager) #!inherits(first,"LazyFrame") + args_modified = names(as.list(sys.call()[-1])) # dispatch on item class and how - first = l[[1L]] - result = pcase( - inherits(first, "DataFrame"), - { - vdf = l_to_vdf(l) - pcase( - how == "vertical", concat_df(vdf), - how == "diagonal", diag_concat_df(vdf), - how == "horizontal", hor_concat_df(vdf), - or_else = stopf("Internal error") - ) + + pcase( + how == "vertical" && (inherits(first, "Series") || is.vector(first)), { + if(any(args_modified %in% c("eager","parallel", "to_super_types", "rechunk"))) { + warning( + "args: parallel and eager takes no effect when concat Series" + ) + } + concat_series(l, rechunk, to_supertypes) }, - inherits(first, "Series"), - { - stopf("not implemented Series") + + how == "vertical", concat_lf(l, rechunk, parallel, to_supertypes), + how == "diagonal", diag_concat_lf(l, rechunk, parallel, to_supertypes), + + how == "horizontal" && !eager, { + Err_plain("how=='horizontal' is not supported for !eager. Try e.g. $join() .") }, - inherits(first, "Expr"), - { - stopf("not implemented Expr") + + how == "horizontal", { + if(any(args_modified %in% c("parallel", "to_super_types"))) { + warning( + "args parallel, rechunk, eager and to_supertypes", + "takes no effect for how=='horizontal'" + ) + } + concat_df(l) }, # TODO implement Series, Expr, Lazy etc - or_else = stopf(paste0("type of first list element: '", class(first), "' is not supported")) - ) + or_else = Err_plain("type of first list element: '", class(first), "' is not supported") - unwrap(result) + ) |> and_then(\(x) { + pcase( + inherits(x,"DataFrame") && !eager, Err_plain("internal logical error in pl$concat()"), + inherits(x,"LazyFrame") && eager, Ok(x$lazy()), + or_else = Ok(x) + ) + + }) |> unwrap( + "in pl$concat()" + ) } diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index febc64dca..f701ebde1 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -353,6 +353,7 @@ LazyFrame_collect = function( streaming ) |> and_then(collect_f) |> + unwrap("in $collect():") } diff --git a/R/series__series.R b/R/series__series.R index ec7c90993..18f7c6e35 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -763,7 +763,7 @@ Series_dtype = method_as_property(function() { #' @keywords Series #' @return DataType #' @aliases Series_flags -#' @name Series_dtype +#' @name Series_flags #' @details property sorted flags are not settable, use set_sorted #' @examples #' pl$Series(1:4)$sort()$flags diff --git a/man/Series_dtype.Rd b/man/Series_dtype.Rd index a29fa59a8..8d591f6e6 100644 --- a/man/Series_dtype.Rd +++ b/man/Series_dtype.Rd @@ -2,30 +2,19 @@ % Please edit documentation in R/series__series.R \name{Series_dtype} \alias{Series_dtype} -\alias{Series_flags} \title{Get data type of Series} \usage{ Series_dtype() - -Series_flags() } \value{ -DataType - DataType } \description{ Get data type of Series - -Get data type of Series -} -\details{ -property sorted flags are not settable, use set_sorted } \examples{ pl$Series(1:4)$dtype pl$Series(c(1, 2))$dtype pl$Series(letters)$dtype -pl$Series(1:4)$sort()$flags } \keyword{Series} diff --git a/man/Series_flags.Rd b/man/Series_flags.Rd new file mode 100644 index 000000000..d909d2c6a --- /dev/null +++ b/man/Series_flags.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/series__series.R +\name{Series_flags} +\alias{Series_flags} +\title{Get data type of Series} +\usage{ +Series_flags() +} +\value{ +DataType +} +\description{ +Get data type of Series +} +\details{ +property sorted flags are not settable, use set_sorted +} +\examples{ +pl$Series(1:4)$sort()$flags +} +\keyword{Series} diff --git a/man/pl_concat.Rd b/man/pl_concat.Rd index 288a30d37..732596535 100644 --- a/man/pl_concat.Rd +++ b/man/pl_concat.Rd @@ -10,7 +10,9 @@ \item{how}{choice of bind direction "vertical"(rbind) "horizontal"(cbind) "diagonal" diagonally} -\item{parallel}{BOOL default TRUE, only used for LazyFrames} +\item{parallel}{Boolean default TRUE, only used for LazyFrames} + +\item{to_supertypes}{Boolean default TRUE, cast columns shared super types, if any.} } \value{ DataFrame, or Series, LazyFrame or Expr diff --git a/src/rust/src/concat.rs b/src/rust/src/concat.rs new file mode 100644 index 000000000..a580a53e4 --- /dev/null +++ b/src/rust/src/concat.rs @@ -0,0 +1,139 @@ +use crate::rdataframe::DataFrame; +use crate::robj_to; + +use crate::rdataframe::LazyFrame; +use crate::rpolarserr::*; +use crate::series::Series; +use crate::{rdataframe::VecDataFrame, utils::r_result_list}; +use extendr_api::prelude::*; +use polars::lazy::dsl; +use polars::prelude as pl; +use polars_core; +use polars_core::functions as pl_functions; +use std::result::Result; + +#[extendr] +fn concat_df(vdf: &VecDataFrame) -> List { + //-> PyResult { + + use polars_core::error::PolarsResult; + use polars_core::utils::rayon::prelude::*; + + let identity_df = (*vdf.0.iter().peekable().peek().unwrap()) + .clone() + .slice(0, 0); + let rdfs: Vec> = + vdf.0.iter().map(|df| Ok(df.clone())).collect(); + let identity = || Ok(identity_df.clone()); + + let result = polars_core::POOL + .install(|| { + rdfs.into_par_iter() + .fold(identity, |acc: PolarsResult, df| { + let mut acc = acc?; + acc.vstack_mut(&df?)?; + Ok(acc) + }) + .reduce(identity, |acc, df| { + let mut acc = acc?; + acc.vstack_mut(&df?)?; + Ok(acc) + }) + }) + .map(DataFrame); + + r_result_list(result.map_err(|err| format!("{:?}", err))) +} + +#[extendr] +fn concat_lf(l: Robj, rechunk: bool, parallel: bool, to_supertypes: bool) -> RResult { + let vlf = robj_to!(Vec, PLLazyFrame, l)?; + dsl::concat( + vlf, + pl::UnionArgs { + parallel, + rechunk, + to_supertypes, + }, + ) + .map_err(polars_to_rpolars_err) + .map(LazyFrame) +} + +#[extendr] +fn diag_concat_df(dfs: Robj) -> RResult { + let df_vec = robj_to!(Vec, PLDataFrame, dfs)?; + pl_functions::diag_concat_df(&df_vec) + .map_err(polars_to_rpolars_err) + .map(DataFrame) +} + +#[extendr] +fn diag_concat_lf(l: Robj, rechunk: bool, parallel: bool) -> RResult { + let vlf = robj_to!(Vec, PLLazyFrame, l)?; + dsl::diag_concat_lf(vlf, rechunk, parallel) + .map_err(polars_to_rpolars_err) + .map(LazyFrame) +} + +#[extendr] +pub fn hor_concat_df(l: Robj) -> RResult { + let df_vec = robj_to!(Vec, PLDataFrame, l)?; + pl_functions::hor_concat_df(&df_vec) + .map_err(polars_to_rpolars_err) + .map(DataFrame) +} + +#[extendr] +pub fn concat_series(l: Robj, rechunk: Robj, to_supertypes: Robj) -> RResult { + let to_supertypes = robj_to!(bool, to_supertypes)?; + let mut s_vec = robj_to!(Vec, PLSeries, l)?; + + // find any common supertype and cast to it + if to_supertypes { + let shared_supertype: RResult> = s_vec + .iter() + .map(|s| s.dtype().clone()) + .fold(Ok(None), |acc, x| match acc { + Err(err) => Err(err), + Ok(None) => Ok(Some(x)), + Ok(Some(acc)) => polars_core::utils::get_supertype(&acc, &x) + .ok_or(RPolarsErr::new().plain("Series' have no common supertype".to_string())) + .map(|dt| Some(dt)), + }); + let shared_supertype = shared_supertype?.expect("cannot be None, unless empty s_vec"); + + for i in 0..s_vec.len() { + if *s_vec[i].dtype() != shared_supertype { + s_vec[i] = s_vec[i] + .cast(&shared_supertype) + .map_err(polars_to_rpolars_err)?; + }; + } + } + + let mut iter = s_vec.into_iter(); + let mut first_s = iter + .next() + .ok_or(RPolarsErr::new().plain("no series found to concatenate".into()))?; + for next_s in iter { + first_s.append(&next_s).map_err(polars_to_rpolars_err)?; + } + + if robj_to!(bool, rechunk)? { + Ok(first_s.rechunk().into()) + } else { + Ok(first_s.into()) + } +} + +extendr_module! { + mod concat; + + fn concat_df; + fn concat_lf; + fn diag_concat_df; + fn diag_concat_lf; + fn hor_concat_df; + fn concat_series; +} diff --git a/src/rust/src/lib.rs b/src/rust/src/lib.rs index 0c62951fb..a8198fbdd 100644 --- a/src/rust/src/lib.rs +++ b/src/rust/src/lib.rs @@ -13,6 +13,7 @@ pub mod concurrent; pub mod lazy; pub mod arrow_interop; +pub mod concat; pub mod conversion; pub mod conversion_r_to_s; pub mod conversion_s_to_r; @@ -42,6 +43,8 @@ pub use crate::rbackground::RBGPOOL; // Macro to generate exports extendr_module! { mod polars; + use rlib; + use concat; use rdataframe; use rpolarserr; use rbackground; diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index 63f54048d..d98d4c53e 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -8,7 +8,6 @@ use crate::conversion_r_to_s::robjname2series; use crate::lazy; use crate::rdatatype; use crate::rdatatype::RPolarsDataType; -use crate::rlib; use crate::robj_to; use crate::rpolarserr::{polars_to_rpolars_err, RResult}; @@ -463,7 +462,7 @@ extendr_module! { use read_ipc; use read_parquet; use rdatatype; - use rlib; + impl DataFrame; impl VecDataFrame; } diff --git a/src/rust/src/rlib.rs b/src/rust/src/rlib.rs index 8bff98dd3..b467a6575 100644 --- a/src/rust/src/rlib.rs +++ b/src/rust/src/rlib.rs @@ -2,77 +2,12 @@ use crate::lazy::dsl::Expr; use crate::lazy::dsl::ProtoExprArray; use crate::rdataframe::DataFrame; use crate::robj_to; - -use crate::rdataframe::LazyFrame; use crate::rpolarserr::{rdbg, RResult}; use crate::series::Series; -use crate::{rdataframe::VecDataFrame, utils::r_result_list}; use extendr_api::prelude::*; -use polars::lazy::dsl; use polars::prelude as pl; -use polars_core::functions as pl_functions; use std::result::Result; -#[extendr] -fn concat_df(vdf: &VecDataFrame) -> List { - //-> PyResult { - - use polars_core::error::PolarsResult; - use polars_core::utils::rayon::prelude::*; - - let identity_df = (*vdf.0.iter().peekable().peek().unwrap()) - .clone() - .slice(0, 0); - let rdfs: Vec> = - vdf.0.iter().map(|df| Ok(df.clone())).collect(); - let identity = || Ok(identity_df.clone()); - - let result = polars_core::POOL - .install(|| { - rdfs.into_par_iter() - .fold(identity, |acc: PolarsResult, df| { - let mut acc = acc?; - acc.vstack_mut(&df?)?; - Ok(acc) - }) - .reduce(identity, |acc, df| { - let mut acc = acc?; - acc.vstack_mut(&df?)?; - Ok(acc) - }) - }) - .map(DataFrame); - - r_result_list(result.map_err(|err| format!("{:?}", err))) -} - -#[extendr] -fn concat_lf(l: Robj, rechunk: bool, parallel: bool, to_supertypes: bool) -> RResult { - let vlf = robj_to!(Vec, PLLazyFrame, l)?; - dsl::concat( - vlf, - pl::UnionArgs { - parallel, - rechunk, - to_supertypes, - }, - ) - .map_err(polars_to_rpolars_err) - .map(LazyFrame) -} - -#[extendr] -fn diag_concat_df(dfs: &VecDataFrame) -> List { - let df = pl_functions::diag_concat_df(&dfs.0[..]).map(DataFrame); - r_result_list(df.map_err(|err| format!("{:?}", err))) -} - -#[extendr] -pub fn hor_concat_df(dfs: &VecDataFrame) -> List { - let df = pl_functions::hor_concat_df(&dfs.0[..]).map(DataFrame); - r_result_list(df.map_err(|err| format!("{:?}", err))) -} - #[extendr] fn min_exprs(exprs: &ProtoExprArray) -> Expr { let exprs = exprs.to_vec("select"); @@ -294,10 +229,7 @@ fn polars_features() -> List { extendr_module! { mod rlib; - fn concat_df; - fn concat_lf; - fn hor_concat_df; - fn diag_concat_df; + fn min_exprs; fn max_exprs; fn coalesce_exprs; diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index e9186ce0f..6a158e3d5 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -3,10 +3,11 @@ pub mod extendr_concurrent; pub mod extendr_helpers; pub mod wrappers; +use crate::conversion_r_to_s::robjname2series; use crate::lazy::dsl::Expr; use crate::rdatatype::RPolarsDataType; -use crate::rpolarserr::{rdbg, rerr, RPolarsErr, RResult, WithRctx}; - +use crate::rpolarserr::{polars_to_rpolars_err, rdbg, rerr, RPolarsErr, RResult, WithRctx}; +use crate::series::Series; use extendr_api::prelude::list; use std::any::type_name as tn; //use std::intrinsics::read_via_copy; @@ -741,7 +742,7 @@ fn internal_rust_wrap_e(robj: Robj, str_to_lit: bool) -> RResult { } } -pub fn robj_to_lazyframe(robj: extendr_api::Robj) -> RResult { +pub fn robj_to_lazyframe(robj: extendr_api::Robj) -> RResult { let robj = unpack_r_result_list(robj)?; let rv = rdbg(&robj); @@ -753,16 +754,60 @@ pub fn robj_to_lazyframe(robj: extendr_api::Robj) -> RResult = robj.try_into()?; Ok(extptr_df.lazy()) } - _ => { + _ if robj.inherits("LazyFrame") => { let lf: ExternalPtr = robj.try_into()?; let lf = LazyFrame(lf.0.clone()); Ok(lf) } + _ => Ok(DataFrame::new_with_capacity(1) + .lazy() + .0 + .select(&[robj_to_rexpr(robj, true)?.0])) + .map(LazyFrame), } }(); - let ext_ldf = res.bad_val(rv).mistyped(tn::())?; - Ok(LazyFrame(ext_ldf.0.clone())) + res.bad_val(rv).mistyped(tn::()) +} + +pub fn robj_to_dataframe(robj: extendr_api::Robj) -> RResult { + let robj = unpack_r_result_list(robj)?; + let robj_clone = robj.clone(); + + // closure to allow ?-convert extendr::Result to RResult + let res = || -> RResult { + match () { + // allow input as a DataFrame + _ if robj.inherits("DataFrame") => { + let extptr_df: ExternalPtr = robj.try_into()?; + Ok(extptr_df.0.clone()) + } + _ if robj.inherits("LazyFrame") => { + let lf: ExternalPtr = robj.try_into()?; + lf.0.clone().collect() + } + _ => DataFrame::new_with_capacity(1) + .lazy() + .0 + .select(&[robj_to_rexpr(robj, true)?.0]) + .collect(), + } + .map(DataFrame) + .map_err(polars_to_rpolars_err) + }(); + + res.bad_val(rdbg(robj_clone)) + .plain("could not be converted into a DataFrame") +} + +pub fn robj_to_series(robj: extendr_api::Robj) -> RResult { + let robj = unpack_r_result_list(robj)?; + let robj_clone = robj.clone(); + robjname2series(robj, "") + .map(Series) + .map_err(polars_to_rpolars_err) + .bad_val(rdbg(robj_clone)) + .plain("could not be converted into a DataFrame") } pub fn list_expr_to_vec_pl_expr( @@ -865,6 +910,14 @@ macro_rules! robj_to_inner { $crate::utils::robj_to_binary_vec($a) }; + (Series, $a:ident) => { + $crate::utils::robj_to_series($a) + }; + + (PLSeries, $a:ident) => { + $crate::utils::robj_to_series($a).map(|ok| ok.0) + }; + (Expr, $a:ident) => { $crate::utils::robj_to_rexpr($a, true) }; @@ -916,6 +969,14 @@ macro_rules! robj_to_inner { $crate::utils::robj_to_lazyframe($a).map(|lf| lf.0) }; + (DataFrame, $a:ident) => { + $crate::utils::robj_to_dataframe($a) + }; + + (PLDataFrame, $a:ident) => { + $crate::utils::robj_to_dataframe($a).map(|lf| lf.0) + }; + (RArrow_schema, $a:ident) => { $crate::utils::robj_to_rarrow_schema($a) }; From d1234484e1c43e52ebdb400081f548ca2268c26e Mon Sep 17 00:00:00 2001 From: sorhawell Date: Sun, 1 Oct 2023 23:32:42 +0300 Subject: [PATCH 03/14] n_chunks + supertypes + unit tests + fix bugs --- R/dataframe__frame.R | 19 ++++++++++ R/error__rpolarserr.R | 9 ++++- R/extendr-wrappers.R | 6 +-- R/functions__eager.R | 57 +++++++++++++++++++--------- R/utils.R | 9 ++++- man/DataFrame_n_chunks.Rd | 27 +++++++++++++ man/nanoarrow.Rd | 8 ++-- src/rust/src/concat.rs | 47 +---------------------- src/rust/src/rdataframe/mod.rs | 21 ++++++++++- src/rust/src/utils/mod.rs | 12 ++++++ tests/testthat/test-concat.R | 67 ++++++++++++++++++++++++++++++--- tests/testthat/test-dataframe.R | 19 ++++++++++ 12 files changed, 219 insertions(+), 82 deletions(-) create mode 100644 man/DataFrame_n_chunks.Rd diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index f2375b91d..9058074cb 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1057,6 +1057,25 @@ DataFrame_first = function() { self$lazy()$first()$collect() } + +#' @title Get the number of chunks of the Series' in a DataFrame +#' @keywords DataFrame +#' @param strategy string either 'all' or 'first'. 'first' only returns chunks for first Series. +#' @return real vector of chunk counts per Series. +#' @examples +#' df = pl$concat( +#' 1:10, +#' pl$concat(1:5,1:5, rechunk = FALSE, how = "vertical")$rename("b"), +#' how = "horizontal" +#' ) +#' df +#' df$n_chunks() +DataFrame_n_chunks = function(strategy = "all") { + .pr$DataFrame$n_chunks(self, strategy) |> + unwrap("in n_chunks():") +} + + #' @title Get the last row of the DataFrame. #' @keywords DataFrame #' @return A DataFrame with one row. diff --git a/R/error__rpolarserr.R b/R/error__rpolarserr.R index af8cd83be..79fa27c68 100644 --- a/R/error__rpolarserr.R +++ b/R/error__rpolarserr.R @@ -58,7 +58,14 @@ Err_plain = function(...) { } # short hand for extracting an error context in unit testing, will raise error if not an RPolarsErr -get_err_ctx = \(x) unwrap_err(result(x))$contexts() +get_err_ctx = \(x, select = NULL) { + ctx = unwrap_err(result(x))$contexts() + if(is.null(select)) { + ctx + } else { + ctx[[match.arg(select,names(ctx))]] + } +} # wrapper to return Result diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 2cbe30350..c76e32d63 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -63,12 +63,8 @@ test_wrong_call_pl_lit <- function(robj) .Call(wrap__test_wrong_call_pl_lit, rob polars_features <- function() .Call(wrap__polars_features) -concat_df <- function(vdf) .Call(wrap__concat_df, vdf) - concat_lf <- function(l, rechunk, parallel, to_supertypes) .Call(wrap__concat_lf, l, rechunk, parallel, to_supertypes) -diag_concat_df <- function(dfs) .Call(wrap__diag_concat_df, dfs) - diag_concat_lf <- function(l, rechunk, parallel) .Call(wrap__diag_concat_lf, l, rechunk, parallel) hor_concat_df <- function(l) .Call(wrap__hor_concat_df, l) @@ -117,6 +113,8 @@ DataFrame <- new.env(parent = emptyenv()) DataFrame$shape <- function() .Call(wrap__DataFrame__shape, self) +DataFrame$n_chunks <- function(strategy) .Call(wrap__DataFrame__n_chunks, self, strategy) + DataFrame$clone_see_me_macro <- function() .Call(wrap__DataFrame__clone_see_me_macro, self) DataFrame$default <- function() .Call(wrap__DataFrame__default) diff --git a/R/functions__eager.R b/R/functions__eager.R index e1c5a7a30..c673b9fd8 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -42,11 +42,14 @@ pl$concat = function( rechunk = TRUE, how = c("vertical", "horizontal", "diagonal"),#, "vertical_relaxed","diangonal_relaxed"), parallel = TRUE, - eager = FALSE, + #eager = FALSE, to_supertypes = FALSE ) { - browser() - l = unpack_list(...) + if(exists("do_browse", .GlobalEnv) && do_browse) browser() + + #unpack arg list + l = unpack_list(..., skip_classes = "data.frame") + ## Check inputs how_args = c("vertical", "horizontal", "diagonal")#, "vertical_relaxed", "diangonal_relaxed") @@ -54,46 +57,64 @@ pl$concat = function( result() |> unwrap("in pl$concat()") - first = l[[1]] - eager = isTRUE(eager) #!inherits(first,"LazyFrame") - args_modified = names(as.list(sys.call()[-1])) + first = l[[1L]] + eager = !inherits(first,"LazyFrame") + args_modified = names(as.list(sys.call()[-1L])) # dispatch on item class and how - pcase( + Result_out = pcase( how == "vertical" && (inherits(first, "Series") || is.vector(first)), { - if(any(args_modified %in% c("eager","parallel", "to_super_types", "rechunk"))) { + if(any(args_modified %in% c("parallel"))) { warning( - "args: parallel and eager takes no effect when concat Series" + "in pl:concat(): args: parallel takes no effect when concatenating Series", + call. = FALSE ) } concat_series(l, rechunk, to_supertypes) }, how == "vertical", concat_lf(l, rechunk, parallel, to_supertypes), - how == "diagonal", diag_concat_lf(l, rechunk, parallel, to_supertypes), + how == "diagonal", { + if(any(args_modified %in% c("to_supertypes"))) { + warning( + "Args to_supertypes", + "takes no effect for how=='diagonal'", + call. = FALSE + ) + } + diag_concat_lf(l, rechunk, parallel) + }, how == "horizontal" && !eager, { - Err_plain("how=='horizontal' is not supported for !eager. Try e.g. $join() .") + Err_plain( + "how=='horizontal' is not supported for lazy (first element is LazyFrame).", + "Try e.g. $join() to get Lazy join or pl$concat(lf1$collect(),lf2,lf3).", + "to get a eager horizontal concatenation" + ) }, how == "horizontal", { - if(any(args_modified %in% c("parallel", "to_super_types"))) { + if(any(args_modified %in% c("parallel", "to_supertypes"))) { warning( - "args parallel, rechunk, eager and to_supertypes", - "takes no effect for how=='horizontal'" + "Args parallel, rechunk, eager and to_supertypes", + "takes no effect for how=='horizontal'", + call. = FALSE ) } - concat_df(l) + hor_concat_df(l) }, # TODO implement Series, Expr, Lazy etc - or_else = Err_plain("type of first list element: '", class(first), "' is not supported") + or_else = Err_plain("internal error:", how, "not handled") + + ) - ) |> and_then(\(x) { + #convert back from lazy if eager + and_then(Result_out, \(x) { pcase( inherits(x,"DataFrame") && !eager, Err_plain("internal logical error in pl$concat()"), - inherits(x,"LazyFrame") && eager, Ok(x$lazy()), + inherits(x,"LazyFrame") && eager, Ok(x$collect()), or_else = Ok(x) ) diff --git a/R/utils.R b/R/utils.R index 2434366a9..8aea0aada 100644 --- a/R/utils.R +++ b/R/utils.R @@ -93,6 +93,7 @@ list2 = list #' Internal unpack list #' @noRd #' @param l any list +#' @param skip_classes char vec, do not unpack list inherits skip_classes. #' @details py-polars syntax only allows e.g. `df.select([expr1, expr2,])` and not #' `df.select(expr1, expr2,)`. r-polars also allows user to directly write #' `df$select(expr1, expr2)` or `df$select(list(expr1,expr2))`. Unpack list @@ -103,9 +104,13 @@ list2 = list #' f = \(...) unpack_list(list(...)) #' identical(f(list(1L, 2L, 3L)), f(1L, 2L, 3L)) # is TRUE #' identical(f(list(1L, 2L), 3L), f(1L, 2L, 3L)) # is FALSE -unpack_list = function(...) { +unpack_list = function(..., skip_classes=NULL) { l = list2(...) - if (length(l) == 1L && is.list(l[[1L]])) { + if ( + length(l) == 1L && + is.list(l[[1L]]) && + !( !is.null(skip_classes) && inherits(l[[1L]],skip_classes) ) + ) { l[[1L]] } else { l diff --git a/man/DataFrame_n_chunks.Rd b/man/DataFrame_n_chunks.Rd new file mode 100644 index 000000000..7320844d7 --- /dev/null +++ b/man/DataFrame_n_chunks.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe__frame.R +\name{DataFrame_n_chunks} +\alias{DataFrame_n_chunks} +\title{Get the number of chunks of the Series' in a DataFrame} +\usage{ +DataFrame_n_chunks(strategy = "all") +} +\arguments{ +\item{strategy}{string either 'all' or 'first'. 'first' only returns chunks for first Series.} +} +\value{ +real vector of chunk counts per Series. +} +\description{ +Get the number of chunks of the Series' in a DataFrame +} +\examples{ +df = pl$concat( + 1:10, + pl$concat(1:5,1:5, rechunk = FALSE, how = "vertical")$rename("b"), + how = "horizontal" +) +df +df$n_chunks() +} +\keyword{DataFrame} diff --git a/man/nanoarrow.Rd b/man/nanoarrow.Rd index 3ecaf02a4..7af2018a2 100644 --- a/man/nanoarrow.Rd +++ b/man/nanoarrow.Rd @@ -16,13 +16,13 @@ \alias{as_record_batch_reader.DataFrame} \title{polars to nanoarrow and arrow} \usage{ -\method{as_nanoarrow_array_stream}{DataFrame}(x, ..., schema = NULL) +as_nanoarrow_array_stream.DataFrame(x, ..., schema = NULL) -\method{infer_nanoarrow_schema}{DataFrame}(x, ...) +infer_nanoarrow_schema.DataFrame(x, ...) -\method{as_arrow_table}{DataFrame}(x, ...) +as_arrow_table.DataFrame(x, ...) -\method{as_record_batch_reader}{DataFrame}(x, ..., schema = NULL) +as_record_batch_reader.DataFrame(x, ..., schema = NULL) } \arguments{ \item{x}{a polars DataFrame} diff --git a/src/rust/src/concat.rs b/src/rust/src/concat.rs index a580a53e4..7ac7e873b 100644 --- a/src/rust/src/concat.rs +++ b/src/rust/src/concat.rs @@ -4,7 +4,6 @@ use crate::robj_to; use crate::rdataframe::LazyFrame; use crate::rpolarserr::*; use crate::series::Series; -use crate::{rdataframe::VecDataFrame, utils::r_result_list}; use extendr_api::prelude::*; use polars::lazy::dsl; use polars::prelude as pl; @@ -12,39 +11,6 @@ use polars_core; use polars_core::functions as pl_functions; use std::result::Result; -#[extendr] -fn concat_df(vdf: &VecDataFrame) -> List { - //-> PyResult { - - use polars_core::error::PolarsResult; - use polars_core::utils::rayon::prelude::*; - - let identity_df = (*vdf.0.iter().peekable().peek().unwrap()) - .clone() - .slice(0, 0); - let rdfs: Vec> = - vdf.0.iter().map(|df| Ok(df.clone())).collect(); - let identity = || Ok(identity_df.clone()); - - let result = polars_core::POOL - .install(|| { - rdfs.into_par_iter() - .fold(identity, |acc: PolarsResult, df| { - let mut acc = acc?; - acc.vstack_mut(&df?)?; - Ok(acc) - }) - .reduce(identity, |acc, df| { - let mut acc = acc?; - acc.vstack_mut(&df?)?; - Ok(acc) - }) - }) - .map(DataFrame); - - r_result_list(result.map_err(|err| format!("{:?}", err))) -} - #[extendr] fn concat_lf(l: Robj, rechunk: bool, parallel: bool, to_supertypes: bool) -> RResult { let vlf = robj_to!(Vec, PLLazyFrame, l)?; @@ -60,14 +26,6 @@ fn concat_lf(l: Robj, rechunk: bool, parallel: bool, to_supertypes: bool) -> RRe .map(LazyFrame) } -#[extendr] -fn diag_concat_df(dfs: Robj) -> RResult { - let df_vec = robj_to!(Vec, PLDataFrame, dfs)?; - pl_functions::diag_concat_df(&df_vec) - .map_err(polars_to_rpolars_err) - .map(DataFrame) -} - #[extendr] fn diag_concat_lf(l: Robj, rechunk: bool, parallel: bool) -> RResult { let vlf = robj_to!(Vec, PLLazyFrame, l)?; @@ -96,7 +54,7 @@ pub fn concat_series(l: Robj, rechunk: Robj, to_supertypes: Robj) -> RResult Err(err), - Ok(None) => Ok(Some(x)), + Ok(None) => Ok(Some(x)), // first fold, acc is None, just us x, Ok(Some(acc)) => polars_core::utils::get_supertype(&acc, &x) .ok_or(RPolarsErr::new().plain("Series' have no common supertype".to_string())) .map(|dt| Some(dt)), @@ -129,10 +87,7 @@ pub fn concat_series(l: Robj, rechunk: Robj, to_supertypes: Robj) -> RResult for DataFrame { DataFrame(item) } } - +use crate::rpolarserr::*; #[extendr] impl DataFrame { pub fn shape(&self) -> Robj { @@ -85,6 +85,25 @@ impl DataFrame { r!([shp.0, shp.1]) } + pub fn n_chunks(&self, strategy: Robj) -> RResult> { + let nchks: Vec<_> = self.0.iter().map(|s| s.n_chunks() as f64).collect(); + + match robj_to!(str, strategy)? { + "all" => Ok(nchks), + "first" => { + if nchks.is_empty() { + Ok(vec![]) + } else { + Ok(vec![nchks.into_iter().next().expect("has atleast len 1")]) + } + } + _ => { + Err(RPolarsErr::new() + .plain("strategy not recognized, neither 'all' or 'first'".into())) + } + } + } + //renamed back to clone pub fn clone_see_me_macro(&self) -> DataFrame { self.clone() diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index 6a158e3d5..cb74aed48 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -13,9 +13,11 @@ use std::any::type_name as tn; //use std::intrinsics::read_via_copy; use crate::lazy::dsl::robj_to_col; use crate::rdataframe::{DataFrame, LazyFrame}; +use extendr_api::eval_string_with_params; use extendr_api::Attributes; use extendr_api::ExternalPtr; use extendr_api::Result as ExtendrResult; +use extendr_api::R; use polars::prelude as pl; //macro to translate polars NULLs and emulate R NA value of any type @@ -759,6 +761,11 @@ pub fn robj_to_lazyframe(robj: extendr_api::Robj) -> RResult { let lf = LazyFrame(lf.0.clone()); Ok(lf) } + _ if robj.inherits("data.frame") => { + let df = unpack_r_eval(R!("polars:::result(pl$DataFrame({{robj}}))"))?; + let extptr_df: ExternalPtr = df.try_into()?; + Ok(extptr_df.lazy()) + } _ => Ok(DataFrame::new_with_capacity(1) .lazy() .0 @@ -786,6 +793,11 @@ pub fn robj_to_dataframe(robj: extendr_api::Robj) -> RResult { let lf: ExternalPtr = robj.try_into()?; lf.0.clone().collect() } + _ if robj.inherits("data.frame") => { + let df = unpack_r_eval(R!("polars:::result(pl$DataFrame({{robj}}))"))?; + let extptr_df: ExternalPtr = df.try_into()?; + Ok(extptr_df.0.clone()) + } _ => DataFrame::new_with_capacity(1) .lazy() .0 diff --git a/tests/testthat/test-concat.R b/tests/testthat/test-concat.R index f448a692f..ffdb83836 100644 --- a/tests/testthat/test-concat.R +++ b/tests/testthat/test-concat.R @@ -1,20 +1,55 @@ test_that("concat dataframe", { - # vertical - l_ver = lapply(1:10, function(i) { + + # vertical dfs + l_ver = lapply(1:3, function(i) { l_internal = list( a = 1:5, b = letters[1:5] ) pl$DataFrame(l_internal) }) + + df_ver = pl$concat(l_ver, how = "vertical") expect_equal( df_ver$to_data_frame(), do.call(rbind, lapply(l_ver, function(df) df$to_data_frame())) ) + # unpack args allowed + df_ver_2 = pl$concat(l_ver[[1L]],l_ver[[2L]],l_ver[[3L]], how="vertical") + expect_identical(df_ver$to_list(),df_ver_2$to_list()) + + # use supertypes + expect_identical( + pl$concat(l_ver[[1L]],pl$DataFrame(a=2,b=42L), how="vertical",to_supertypes = TRUE)$to_list(), + pl$DataFrame(rbind(data.frame(a = 1:5, b = letters[1:5]), data.frame(a = 2, b = 42L)))$to_list() + ) + + # type 'relaxed' vertical concatenation is not allowed by default + expect_true( + pl$concat(l_ver[[1L]],pl$DataFrame(a=2,b=42L), how="vertical") |> + get_err_ctx() |> + (\(ctx) ctx$PolarsError)() |> + grepl(pat = "dtypes for column", fixed = TRUE) + ) + + + #check lazy eager is identical + l_ver_lazy = lapply(l_ver, \(df) df$lazy()) + expect_identical( + pl$concat(l_ver_lazy)$collect()$to_list(), + pl$concat(l_ver)$to_list() + ) + + #check rechunk works + expect_identical( pl$concat(mtcars, mtcars, rechunk = TRUE)$n_chunks(), rep(1, 11)) + expect_identical( pl$concat(mtcars, mtcars, rechunk = FALSE)$n_chunks(), rep(2, 11)) + + + # horizontal - l_hor = lapply(1:10, function(i) { + l_hor = lapply(1:5, function(i) { l_internal = list( 1:5, letters[1:5] @@ -28,8 +63,28 @@ test_that("concat dataframe", { do.call(cbind, lapply(l_hor, function(df) df$to_data_frame())) ) - # diagonal + pl$concat(pl$LazyFrame(a=1:3), how = "horizontal") |> + get_err_ctx( "Plain") |> + startsWith("how=='horizontal' is not supported for lazy") |> + expect_true() + + # can concat Series + expect_identical( + pl$concat(1:5,pl$Series(5:1,"b"), how = "horizontal")$to_list(), + list(1:5,b=5:1) + ) + + + # diagonal eager df_dia = pl$concat(l_hor, how = "diagonal") - expect_equal(df_dia$shape, c(50, 20)) - expect_equal(mean(is.na(df_dia$to_data_frame())), 9 / 10) + expect_equal(df_dia$shape, c(25, 10)) + expect_equal(mean(is.na(df_dia$to_data_frame())), 8 / 10) + + #diagonal lazy + lf_dia = pl$concat(l_hor |> lapply(pl$LazyFrame), how = "diagonal") + expect_identical( + lf_dia$collect()$to_list(), + df_dia$to_list() + ) + }) diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index 6cf7293ee..073f81df0 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -775,6 +775,25 @@ test_that("join_asof_simple", { ) }) +test_that("n_chunks", { + + df = pl$concat( + 1:10, + pl$concat(1:5,1:5, rechunk = FALSE, how = "vertical")$rename("b"), + how = "horizontal" + ) + + expect_identical( df$n_chunks(), c(1,2)) + expect_identical( df$n_chunks("first"), c(1)) + expect_identical( pl$DataFrame()$n_chunks(), numeric()) + expect_identical( pl$DataFrame()$n_chunks("first"), numeric()) + + pl$DataFrame()$n_chunks("wrong strat") |> + get_err_ctx("Plain") |> + grepl(pat = "strategy") |> + expect_true() +}) + test_that("melt example", { df = pl$DataFrame( From 3f8ef34a3b8f09b329df447f24f390ff7b93cfef Mon Sep 17 00:00:00 2001 From: sorhawell Date: Sun, 1 Oct 2023 23:33:07 +0300 Subject: [PATCH 04/14] fmt --- R/dataframe__frame.R | 2 +- R/error__rpolarserr.R | 6 ++-- R/expr__meta.R | 3 +- R/functions__eager.R | 60 ++++++++++++++++----------------- R/lazyframe__lazy.R | 12 +++---- R/utils.R | 6 ++-- tests/testthat/test-concat.R | 34 +++++++++---------- tests/testthat/test-dataframe.R | 19 +++++------ tests/testthat/test-expr_arr.R | 1 - 9 files changed, 67 insertions(+), 76 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 9058074cb..db998bd31 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1065,7 +1065,7 @@ DataFrame_first = function() { #' @examples #' df = pl$concat( #' 1:10, -#' pl$concat(1:5,1:5, rechunk = FALSE, how = "vertical")$rename("b"), +#' pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), #' how = "horizontal" #' ) #' df diff --git a/R/error__rpolarserr.R b/R/error__rpolarserr.R index 79fa27c68..b851e3bb9 100644 --- a/R/error__rpolarserr.R +++ b/R/error__rpolarserr.R @@ -54,16 +54,16 @@ bad_robj = function(r) { } Err_plain = function(...) { - Err(.pr$RPolarsErr$new()$plain(paste(..., collapse=" "))) + Err(.pr$RPolarsErr$new()$plain(paste(..., collapse = " "))) } # short hand for extracting an error context in unit testing, will raise error if not an RPolarsErr get_err_ctx = \(x, select = NULL) { ctx = unwrap_err(result(x))$contexts() - if(is.null(select)) { + if (is.null(select)) { ctx } else { - ctx[[match.arg(select,names(ctx))]] + ctx[[match.arg(select, names(ctx))]] } } diff --git a/R/expr__meta.R b/R/expr__meta.R index 76a6e7d8d..d88f7026c 100644 --- a/R/expr__meta.R +++ b/R/expr__meta.R @@ -166,9 +166,8 @@ ExprMeta_is_regex_projection = function() { #' @examples #' my_expr = (pl$col("foo") * pl$col("bar"))$sum()$over(pl$col("ham")) / 2 #' my_expr$meta$tree_format() - ExprMeta_tree_format = function(return_as_string = FALSE) { - out <- .pr$Expr$meta_tree_format(self) |> + out = .pr$Expr$meta_tree_format(self) |> unwrap("in $tree_format():") if (isTRUE(return_as_string)) { out diff --git a/R/functions__eager.R b/R/functions__eager.R index c673b9fd8..a92a5ec62 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -40,32 +40,32 @@ pl$concat = function( ..., # list of DataFrames or Series or lazyFrames or expr rechunk = TRUE, - how = c("vertical", "horizontal", "diagonal"),#, "vertical_relaxed","diangonal_relaxed"), + how = c("vertical", "horizontal", "diagonal"), # , "vertical_relaxed","diangonal_relaxed"), parallel = TRUE, - #eager = FALSE, - to_supertypes = FALSE - ) { - if(exists("do_browse", .GlobalEnv) && do_browse) browser() + # eager = FALSE, + to_supertypes = FALSE) { + if (exists("do_browse", .GlobalEnv) && do_browse) browser() - #unpack arg list + # unpack arg list l = unpack_list(..., skip_classes = "data.frame") ## Check inputs - how_args = c("vertical", "horizontal", "diagonal")#, "vertical_relaxed", "diangonal_relaxed") + how_args = c("vertical", "horizontal", "diagonal") # , "vertical_relaxed", "diangonal_relaxed") how = match.arg(how[1L], how_args) |> result() |> unwrap("in pl$concat()") first = l[[1L]] - eager = !inherits(first,"LazyFrame") + eager = !inherits(first, "LazyFrame") args_modified = names(as.list(sys.call()[-1L])) # dispatch on item class and how Result_out = pcase( - how == "vertical" && (inherits(first, "Series") || is.vector(first)), { - if(any(args_modified %in% c("parallel"))) { + how == "vertical" && (inherits(first, "Series") || is.vector(first)), + { + if (any(args_modified %in% c("parallel"))) { warning( "in pl:concat(): args: parallel takes no effect when concatenating Series", call. = FALSE @@ -73,10 +73,11 @@ pl$concat = function( } concat_series(l, rechunk, to_supertypes) }, - - how == "vertical", concat_lf(l, rechunk, parallel, to_supertypes), - how == "diagonal", { - if(any(args_modified %in% c("to_supertypes"))) { + how == "vertical", + concat_lf(l, rechunk, parallel, to_supertypes), + how == "diagonal", + { + if (any(args_modified %in% c("to_supertypes"))) { warning( "Args to_supertypes", "takes no effect for how=='diagonal'", @@ -85,17 +86,17 @@ pl$concat = function( } diag_concat_lf(l, rechunk, parallel) }, - - how == "horizontal" && !eager, { - Err_plain( - "how=='horizontal' is not supported for lazy (first element is LazyFrame).", - "Try e.g. $join() to get Lazy join or pl$concat(lf1$collect(),lf2,lf3).", - "to get a eager horizontal concatenation" - ) + how == "horizontal" && !eager, + { + Err_plain( + "how=='horizontal' is not supported for lazy (first element is LazyFrame).", + "Try e.g. $join() to get Lazy join or pl$concat(lf1$collect(),lf2,lf3).", + "to get a eager horizontal concatenation" + ) }, - - how == "horizontal", { - if(any(args_modified %in% c("parallel", "to_supertypes"))) { + how == "horizontal", + { + if (any(args_modified %in% c("parallel", "to_supertypes"))) { warning( "Args parallel, rechunk, eager and to_supertypes", "takes no effect for how=='horizontal'", @@ -107,17 +108,15 @@ pl$concat = function( # TODO implement Series, Expr, Lazy etc or_else = Err_plain("internal error:", how, "not handled") - ) - #convert back from lazy if eager + # convert back from lazy if eager and_then(Result_out, \(x) { pcase( - inherits(x,"DataFrame") && !eager, Err_plain("internal logical error in pl$concat()"), - inherits(x,"LazyFrame") && eager, Ok(x$collect()), + inherits(x, "DataFrame") && !eager, Err_plain("internal logical error in pl$concat()"), + inherits(x, "LazyFrame") && eager, Ok(x$collect()), or_else = Ok(x) ) - }) |> unwrap( "in pl$concat()" ) @@ -185,8 +184,7 @@ pl$date_range = function( name = NULL, # : str | None = None, time_unit = "us", time_zone = NULL, # : str | None = None - explode = TRUE - ) { + explode = TRUE) { if (missing(end)) { end = start interval = "1h" diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index f701ebde1..274af118a 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -353,7 +353,6 @@ LazyFrame_collect = function( streaming ) |> and_then(collect_f) |> - unwrap("in $collect():") } @@ -1320,10 +1319,10 @@ LazyFrame_clone = function() { #' b = c("one", "two", "three", "four", "five"), #' c = 6:10 #' )$ -#' select( -#' pl$col("b")$to_struct(), -#' pl$col("a", "c")$to_struct()$alias("a_and_c") -#' ) +#' select( +#' pl$col("b")$to_struct(), +#' pl$col("a", "c")$to_struct()$alias("a_and_c") +#' ) #' lf$collect() #' #' # by default, all struct columns are unnested @@ -1331,10 +1330,9 @@ LazyFrame_clone = function() { #' #' # we can specify specific columns to unnest #' lf$unnest("a_and_c")$collect() - LazyFrame_unnest = function(names = NULL) { if (is.null(names)) { - names <- names(which(dtypes_are_struct(.pr$LazyFrame$schema(self)$ok))) + names = names(which(dtypes_are_struct(.pr$LazyFrame$schema(self)$ok))) } unwrap(.pr$LazyFrame$unnest(self, names), "in $unnest():") } diff --git a/R/utils.R b/R/utils.R index 8aea0aada..8f18924cc 100644 --- a/R/utils.R +++ b/R/utils.R @@ -104,12 +104,12 @@ list2 = list #' f = \(...) unpack_list(list(...)) #' identical(f(list(1L, 2L, 3L)), f(1L, 2L, 3L)) # is TRUE #' identical(f(list(1L, 2L), 3L), f(1L, 2L, 3L)) # is FALSE -unpack_list = function(..., skip_classes=NULL) { +unpack_list = function(..., skip_classes = NULL) { l = list2(...) if ( length(l) == 1L && - is.list(l[[1L]]) && - !( !is.null(skip_classes) && inherits(l[[1L]],skip_classes) ) + is.list(l[[1L]]) && + !(!is.null(skip_classes) && inherits(l[[1L]], skip_classes)) ) { l[[1L]] } else { diff --git a/tests/testthat/test-concat.R b/tests/testthat/test-concat.R index ffdb83836..8851a6fc3 100644 --- a/tests/testthat/test-concat.R +++ b/tests/testthat/test-concat.R @@ -1,5 +1,4 @@ test_that("concat dataframe", { - # vertical dfs l_ver = lapply(1:3, function(i) { l_internal = list( @@ -17,34 +16,34 @@ test_that("concat dataframe", { ) # unpack args allowed - df_ver_2 = pl$concat(l_ver[[1L]],l_ver[[2L]],l_ver[[3L]], how="vertical") - expect_identical(df_ver$to_list(),df_ver_2$to_list()) + df_ver_2 = pl$concat(l_ver[[1L]], l_ver[[2L]], l_ver[[3L]], how = "vertical") + expect_identical(df_ver$to_list(), df_ver_2$to_list()) # use supertypes expect_identical( - pl$concat(l_ver[[1L]],pl$DataFrame(a=2,b=42L), how="vertical",to_supertypes = TRUE)$to_list(), + pl$concat(l_ver[[1L]], pl$DataFrame(a = 2, b = 42L), how = "vertical", to_supertypes = TRUE)$to_list(), pl$DataFrame(rbind(data.frame(a = 1:5, b = letters[1:5]), data.frame(a = 2, b = 42L)))$to_list() ) # type 'relaxed' vertical concatenation is not allowed by default expect_true( - pl$concat(l_ver[[1L]],pl$DataFrame(a=2,b=42L), how="vertical") |> - get_err_ctx() |> - (\(ctx) ctx$PolarsError)() |> - grepl(pat = "dtypes for column", fixed = TRUE) + pl$concat(l_ver[[1L]], pl$DataFrame(a = 2, b = 42L), how = "vertical") |> + get_err_ctx() |> + (\(ctx) ctx$PolarsError)() |> + grepl(pat = "dtypes for column", fixed = TRUE) ) - #check lazy eager is identical + # check lazy eager is identical l_ver_lazy = lapply(l_ver, \(df) df$lazy()) expect_identical( pl$concat(l_ver_lazy)$collect()$to_list(), pl$concat(l_ver)$to_list() ) - #check rechunk works - expect_identical( pl$concat(mtcars, mtcars, rechunk = TRUE)$n_chunks(), rep(1, 11)) - expect_identical( pl$concat(mtcars, mtcars, rechunk = FALSE)$n_chunks(), rep(2, 11)) + # check rechunk works + expect_identical(pl$concat(mtcars, mtcars, rechunk = TRUE)$n_chunks(), rep(1, 11)) + expect_identical(pl$concat(mtcars, mtcars, rechunk = FALSE)$n_chunks(), rep(2, 11)) @@ -63,15 +62,15 @@ test_that("concat dataframe", { do.call(cbind, lapply(l_hor, function(df) df$to_data_frame())) ) - pl$concat(pl$LazyFrame(a=1:3), how = "horizontal") |> - get_err_ctx( "Plain") |> + pl$concat(pl$LazyFrame(a = 1:3), how = "horizontal") |> + get_err_ctx("Plain") |> startsWith("how=='horizontal' is not supported for lazy") |> expect_true() # can concat Series expect_identical( - pl$concat(1:5,pl$Series(5:1,"b"), how = "horizontal")$to_list(), - list(1:5,b=5:1) + pl$concat(1:5, pl$Series(5:1, "b"), how = "horizontal")$to_list(), + list(1:5, b = 5:1) ) @@ -80,11 +79,10 @@ test_that("concat dataframe", { expect_equal(df_dia$shape, c(25, 10)) expect_equal(mean(is.na(df_dia$to_data_frame())), 8 / 10) - #diagonal lazy + # diagonal lazy lf_dia = pl$concat(l_hor |> lapply(pl$LazyFrame), how = "diagonal") expect_identical( lf_dia$collect()$to_list(), df_dia$to_list() ) - }) diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index 073f81df0..cd2e8a11c 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -776,22 +776,21 @@ test_that("join_asof_simple", { }) test_that("n_chunks", { - df = pl$concat( 1:10, - pl$concat(1:5,1:5, rechunk = FALSE, how = "vertical")$rename("b"), + pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), how = "horizontal" ) - expect_identical( df$n_chunks(), c(1,2)) - expect_identical( df$n_chunks("first"), c(1)) - expect_identical( pl$DataFrame()$n_chunks(), numeric()) - expect_identical( pl$DataFrame()$n_chunks("first"), numeric()) + expect_identical(df$n_chunks(), c(1, 2)) + expect_identical(df$n_chunks("first"), c(1)) + expect_identical(pl$DataFrame()$n_chunks(), numeric()) + expect_identical(pl$DataFrame()$n_chunks("first"), numeric()) - pl$DataFrame()$n_chunks("wrong strat") |> - get_err_ctx("Plain") |> - grepl(pat = "strategy") |> - expect_true() + pl$DataFrame()$n_chunks("wrong strat") |> + get_err_ctx("Plain") |> + grepl(pat = "strategy") |> + expect_true() }) diff --git a/tests/testthat/test-expr_arr.R b/tests/testthat/test-expr_arr.R index e11138021..fc4d26c31 100644 --- a/tests/testthat/test-expr_arr.R +++ b/tests/testthat/test-expr_arr.R @@ -436,4 +436,3 @@ test_that("eval", { ) ) }) - From 1336f502c67946375f2ee003204b10649c9a302f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Havelund=20Welling?= Date: Mon, 2 Oct 2023 17:36:33 +0300 Subject: [PATCH 05/14] Apply suggestions from code review Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- R/functions__eager.R | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/R/functions__eager.R b/R/functions__eager.R index a92a5ec62..4a4ab45e7 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -44,7 +44,6 @@ pl$concat = function( parallel = TRUE, # eager = FALSE, to_supertypes = FALSE) { - if (exists("do_browse", .GlobalEnv) && do_browse) browser() # unpack arg list l = unpack_list(..., skip_classes = "data.frame") @@ -67,7 +66,7 @@ pl$concat = function( { if (any(args_modified %in% c("parallel"))) { warning( - "in pl:concat(): args: parallel takes no effect when concatenating Series", + "in pl$concat(): argument `parallel` is not used when concatenating Series", call. = FALSE ) } @@ -79,8 +78,7 @@ pl$concat = function( { if (any(args_modified %in% c("to_supertypes"))) { warning( - "Args to_supertypes", - "takes no effect for how=='diagonal'", + "Argument `to_supertypes` is not used when how=='diagonal'", call. = FALSE ) } @@ -90,7 +88,7 @@ pl$concat = function( { Err_plain( "how=='horizontal' is not supported for lazy (first element is LazyFrame).", - "Try e.g. $join() to get Lazy join or pl$concat(lf1$collect(),lf2,lf3).", + "Try e.g. $join() to get Lazy join or pl$concat(lf1$collect(), lf2, lf3).", "to get a eager horizontal concatenation" ) }, @@ -98,8 +96,7 @@ pl$concat = function( { if (any(args_modified %in% c("parallel", "to_supertypes"))) { warning( - "Args parallel, rechunk, eager and to_supertypes", - "takes no effect for how=='horizontal'", + "Arguments `parallel`, `rechunk`, `eager` and `to_supertypes` are not used when how=='horizontal'", call. = FALSE ) } From 7ecd49cc6efc6463c9bb1fcb1d5dc81cbd15bfdf Mon Sep 17 00:00:00 2001 From: sorhawell Date: Tue, 3 Oct 2023 11:19:19 +0300 Subject: [PATCH 06/14] add also rechunk improve docs --- R/dataframe__frame.R | 115 +++++++++++++++++++++++++++++++++++++- R/extendr-wrappers.R | 2 + man/DataFrame_n_chunks.Rd | 55 ++++++++++++++++-- man/DataFrame_rechunk.Rd | 72 ++++++++++++++++++++++++ 4 files changed, 237 insertions(+), 7 deletions(-) create mode 100644 man/DataFrame_rechunk.Rd diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 0340a4647..e01f10365 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1034,24 +1034,133 @@ DataFrame_first = function() { } -#' @title Get the number of chunks of the Series' in a DataFrame +#' @title Number of chunks of the Series' in a DataFrame +#' @description Number of chunks (mem allocations) for all or first Series' in a DataFrame. +#' @details +#' DataFrame is a vector of Series'. Each Series in rust-polars is a wrapper around a ChunkedArray, +#' which is like a virtual contiguous vector, which is physically backed by an ordered set of +#' chunks. Each chunk of values has a contiguous memory layout and is an arrow array. Arrow arrays +#' are a fast, thread-safe and cross-platform memory layout. +#' +#' In R when combining with `c()` or `rbind()`, it requires immediate vector re-allocation to place +#' vector values in contiguous memory, hence repeated appending to a vector in R is discouraged +#' because slow. +#' +#' When concatenating/appending to Series' or DataFrame' re-allocation can be avoided or delayed, +#' by simply appending chunks to each individual Series. +#' +#' However, if chunks become many and small or are misaligned across Series, this can hurt the +#' performance of following operations. +#' +#' Most places in the polars api where chunking could occur, the user have to +#' typically actively opt-out by setting and arg `rechunk = FALSE`. Thus it is not normal +#' #' @keywords DataFrame #' @param strategy string either 'all' or 'first'. 'first' only returns chunks for first Series. #' @return real vector of chunk counts per Series. +#' @seealso [`$rechunk()`][DataFrame_rechunk] #' @examples +#' # create DataFrame with miss aligned chunks #' df = pl$concat( -#' 1:10, -#' pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), +#' 1:10, # single chunk +#' pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks #' how = "horizontal" #' ) #' df #' df$n_chunks() +#' +#' # rechunk a chunked DataFrame +#' df$rechunk()$n_chunks() +#' +#' # rechunk is not an in-place operation +#' df$n_chunks() +#' +#' # The following toy example emulates the Series "chunkyness" in R. Here it a S3-classed list with +#' # same type of vectors and where have all relevant S3 generics implemented to make behave as if +#' # it was a regular vector. +#' "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") +#' print.chunked_vector = \(x, ...) print(unlist(x), ...) +#' c.chunked_vector = \(...) structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +#' rechunk = \(x) structure(unlist(x), class = "chunked_vector") +#' x = structure(list(1:4, 5L), class = "chunked_vector") +#' x +#' x + 5:1 +#' lapply(x, tracemem) # trace chunks to verify no re-allocation +#' z = c(x, x) +#' z # looks like a plain vector +#' lapply(z, tracemem) # mem allocation in z are the same from x +#' str(z) +#' z = rechunk(z) +#' str(z) DataFrame_n_chunks = function(strategy = "all") { .pr$DataFrame$n_chunks(self, strategy) |> unwrap("in n_chunks():") } +#' @title Rechunk a DataFrame +#' @description Rechunking re-allocates any "chunked" memory allocations to speed-up e.g. vectorized +#' operations. +#' @details +#' DataFrame is a vector of Series'. Each Series in rust-polars is a wrapper around a ChunkedArray, +#' which is like a virtual contiguous vector, which is physically backed by an ordered set of +#' chunks. Each chunk of values has a contiguous memory layout and is an arrow array. Arrow arrays +#' are a fast, thread-safe and cross-platform memory layout. +#' +#' In R when combining with `c()` or `rbind()`, it requires immediate vector re-allocation to place +#' vector values in contiguous memory, hence repeated appending to a vector in R is discouraged +#' because slow. +#' +#' When concatenating/appending to Series' or DataFrame' re-allocation can be avoided or delayed, +#' by simply appending chunks to each individual Series. +#' +#' However, if chunks become many and small or are misaligned across Series, this can hurt the +#' performance of following operations. +#' +#' Most places in the polars api where chunking could occur, the user have to +#' typically actively opt-out by setting and arg `rechunk = FALSE`. Thus it is not normal +#' +#' @keywords DataFrame +#' @return real vector of chunk counts per Series. +#' @seealso [`$n_chunks()`][DataFrame_n_chunks] +#' @examples +#' # create DataFrame with miss alligned chunks +#' df = pl$concat( +#' 1:10, # single chunk +#' pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks +#' how = "horizontal" +#' ) +#' df +#' df$n_chunks() +#' +#' # rechunk a chunked DataFrame +#' df$rechunk()$n_chunks() +#' +#' # rechunk is not an in-place operation +#' df$n_chunks() +#' +#' # The following toy example emulates the Series "chunkyness" in R. Here it a S3-classed list with +#' # same type of vectors and where have all relevant S3 generics implemented to make behave as if +#' # it was a regular vector. +#' "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") +#' print.chunked_vector = \(x, ...) print(unlist(x), ...) +#' c.chunked_vector = \(...) structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +#' rechunk = \(x) structure(unlist(x), class = "chunked_vector") +#' x = structure(list(1:4, 5L), class = "chunked_vector") +#' x +#' x + 5:1 +#' lapply(x, tracemem) # trace chunks to verify no re-allocation +#' z = c(x, x) +#' z # looks like a plain vector +#' lapply(z, tracemem) # mem allocation in z are the same from x +#' str(z) +#' z = rechunk(z) +#' str(z) +DataFrame_rechunk = function() { + .pr$DataFrame$rechunk(self) +} + + #' @title Get the last row of the DataFrame. #' @keywords DataFrame #' @return A DataFrame with one row. diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index c76e32d63..0be24bfc1 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -115,6 +115,8 @@ DataFrame$shape <- function() .Call(wrap__DataFrame__shape, self) DataFrame$n_chunks <- function(strategy) .Call(wrap__DataFrame__n_chunks, self, strategy) +DataFrame$rechunk <- function() .Call(wrap__DataFrame__rechunk, self) + DataFrame$clone_see_me_macro <- function() .Call(wrap__DataFrame__clone_see_me_macro, self) DataFrame$default <- function() .Call(wrap__DataFrame__default) diff --git a/man/DataFrame_n_chunks.Rd b/man/DataFrame_n_chunks.Rd index 7320844d7..741043e79 100644 --- a/man/DataFrame_n_chunks.Rd +++ b/man/DataFrame_n_chunks.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/dataframe__frame.R \name{DataFrame_n_chunks} \alias{DataFrame_n_chunks} -\title{Get the number of chunks of the Series' in a DataFrame} +\title{Number of chunks of the Series' in a DataFrame} \usage{ DataFrame_n_chunks(strategy = "all") } @@ -13,15 +13,62 @@ DataFrame_n_chunks(strategy = "all") real vector of chunk counts per Series. } \description{ -Get the number of chunks of the Series' in a DataFrame +Number of chunks (mem allocations) for all or first Series' in a DataFrame. +} +\details{ +DataFrame is a vector of Series'. Each Series in rust-polars is a wrapper around a ChunkedArray, +which is like a virtual contiguous vector, which is physically backed by an ordered set of +chunks. Each chunk of values has a contiguous memory layout and is an arrow array. Arrow arrays +are a fast, thread-safe and cross-platform memory layout. + +In R when combining with \code{c()} or \code{rbind()}, it requires immediate vector re-allocation to place +vector values in contiguous memory, hence repeated appending to a vector in R is discouraged +because slow. + +When concatenating/appending to Series' or DataFrame' re-allocation can be avoided or delayed, +by simply appending chunks to each individual Series. + +However, if chunks become many and small or are misaligned across Series, this can hurt the +performance of following operations. + +Most places in the polars api where chunking could occur, the user have to +typically actively opt-out by setting and arg \code{rechunk = FALSE}. Thus it is not normal } \examples{ +# create DataFrame with miss aligned chunks df = pl$concat( - 1:10, - pl$concat(1:5,1:5, rechunk = FALSE, how = "vertical")$rename("b"), + 1:10, # single chunk + pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks how = "horizontal" ) df df$n_chunks() + +# rechunk a chunked DataFrame +df$rechunk()$n_chunks() + +# rechunk is not an in-place operation +df$n_chunks() + +# The following toy example emulates the Series "chunkyness" in R. Here it a S3-classed list with +# same type of vectors and where have all relevant S3 generics implemented to make behave as if +# it was a regular vector. +'+.chunked_vector' = \(x, y) structure(list(unlist(x) + unlist(y)),class = "chunked_vector") +print.chunked_vector = \(x, ...) print(unlist(x), ...) +c.chunked_vector = \(...) structure(do.call(c,lapply(list(...),unclass)),class="chunked_vector") +rechunk = \(x) structure(unlist(x),class = "chunked_vector") +x = structure(list(1:4, 5L),class = "chunked_vector") +x +x + 5:1 +lapply(x, tracemem) # trace chunks to verify no re-allocation +z = c(x, x) +z # looks like a plain vector +lapply(z, tracemem) # mem allocation in z are the same from x +str(z) +z = rechunk(z) +str(z) +} +\seealso{ +\code{\link[=DataFrame_rechunk]{$rechunk()}} } \keyword{DataFrame} diff --git a/man/DataFrame_rechunk.Rd b/man/DataFrame_rechunk.Rd new file mode 100644 index 000000000..3f8c27a3d --- /dev/null +++ b/man/DataFrame_rechunk.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe__frame.R +\name{DataFrame_rechunk} +\alias{DataFrame_rechunk} +\title{Rechunk a DataFrame} +\usage{ +DataFrame_rechunk() +} +\value{ +real vector of chunk counts per Series. +} +\description{ +Rechunking re-allocates any "chunked" memory allocations to speed-up e.g. vectorized +operations. +} +\details{ +DataFrame is a vector of Series'. Each Series in rust-polars is a wrapper around a ChunkedArray, +which is like a virtual contiguous vector, which is physically backed by an ordered set of +chunks. Each chunk of values has a contiguous memory layout and is an arrow array. Arrow arrays +are a fast, thread-safe and cross-platform memory layout. + +In R when combining with \code{c()} or \code{rbind()}, it requires immediate vector re-allocation to place +vector values in contiguous memory, hence repeated appending to a vector in R is discouraged +because slow. + +When concatenating/appending to Series' or DataFrame' re-allocation can be avoided or delayed, +by simply appending chunks to each individual Series. + +However, if chunks become many and small or are misaligned across Series, this can hurt the +performance of following operations. + +Most places in the polars api where chunking could occur, the user have to +typically actively opt-out by setting and arg \code{rechunk = FALSE}. Thus it is not normal +} +\examples{ +# create DataFrame with miss alligned chunks +df = pl$concat( + 1:10, # single chunk + pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks + how = "horizontal" +) +df +df$n_chunks() + +# rechunk a chunked DataFrame +df$rechunk()$n_chunks() + +# rechunk is not an in-place operation +df$n_chunks() + +# The following toy example emulates the Series "chunkyness" in R. Here it a S3-classed list with +# same type of vectors and where have all relevant S3 generics implemented to make behave as if +# it was a regular vector. +'+.chunked_vector' = \(x, y) structure(list(unlist(x) + unlist(y)),class = "chunked_vector") +print.chunked_vector = \(x, ...) print(unlist(x), ...) +c.chunked_vector = \(...) structure(do.call(c,lapply(list(...),unclass)),class="chunked_vector") +rechunk = \(x) structure(unlist(x),class = "chunked_vector") +x = structure(list(1:4, 5L),class = "chunked_vector") +x +x + 5:1 +lapply(x, tracemem) # trace chunks to verify no re-allocation +z = c(x, x) +z # looks like a plain vector +lapply(z, tracemem) # mem allocation in z are the same from x +str(z) +z = rechunk(z) +str(z) +} +\seealso{ +\code{\link[=DataFrame_n_chunks]{$n_chunks()}} +} +\keyword{DataFrame} From c7f14cd76840f2043e5d9fd15dab5ba4ead6099f Mon Sep 17 00:00:00 2001 From: sorhawell Date: Tue, 3 Oct 2023 11:19:50 +0300 Subject: [PATCH 07/14] pl$concat() returns now NULL --- R/functions__eager.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/functions__eager.R b/R/functions__eager.R index 4a4ab45e7..6b4adcdd8 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -44,10 +44,14 @@ pl$concat = function( parallel = TRUE, # eager = FALSE, to_supertypes = FALSE) { - # unpack arg list l = unpack_list(..., skip_classes = "data.frame") + # nothing becomes NULL + if (length(l) == 0L) { + return(NULL) + } + ## Check inputs how_args = c("vertical", "horizontal", "diagonal") # , "vertical_relaxed", "diangonal_relaxed") From 86875a690818ae9b6a943674d460dbe949b31bff Mon Sep 17 00:00:00 2001 From: sorhawell Date: Tue, 3 Oct 2023 11:20:18 +0300 Subject: [PATCH 08/14] rechunk rs --- src/rust/src/rdataframe/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index 18a73cc88..2e4c77433 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -104,6 +104,10 @@ impl DataFrame { } } + pub fn rechunk(&self) -> Self { + self.0.agg_chunks().into() + } + //renamed back to clone pub fn clone_see_me_macro(&self) -> DataFrame { self.clone() From 67668cf5d25655731d337c068bc6a8f795d58e3f Mon Sep 17 00:00:00 2001 From: sorhawell Date: Tue, 3 Oct 2023 11:20:31 +0300 Subject: [PATCH 09/14] fmt --- R/dataframe__frame.R | 5 ++--- R/lazyframe__lazy.R | 1 - man/LazyFrame_unnest.Rd | 8 ++++---- man/as_polars_series.Rd | 4 ++-- man/nanoarrow.Rd | 8 ++++---- tests/testthat/test-dataframe.R | 2 +- tests/testthat/test-lazy.R | 12 ++++++------ 7 files changed, 19 insertions(+), 21 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index e01f10365..0efcd191d 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -133,7 +133,6 @@ DataFrame #' #' # custom schema #' pl$DataFrame(iris, schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8)) - pl$DataFrame = function(..., make_names_unique = TRUE, schema = NULL) { largs = unpack_list(...) @@ -181,9 +180,9 @@ pl$DataFrame = function(..., make_names_unique = TRUE, schema = NULL) { names(largs) = keys lapply(seq_along(largs), \(x) { varname = keys[x] - out <- pl$lit(largs[[x]]) + out = pl$lit(largs[[x]]) if (!is.null(schema) && varname %in% names(schema)) { - out <- out$cast(schema[[varname]], strict = TRUE) + out = out$cast(schema[[varname]], strict = TRUE) } out$alias(varname) }) |> diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index ad844460a..6b556b001 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -146,7 +146,6 @@ LazyFrame #' iris, #' schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8) #' )$collect() - pl$LazyFrame = function(...) { pl$DataFrame(...)$lazy() } diff --git a/man/LazyFrame_unnest.Rd b/man/LazyFrame_unnest.Rd index d8414d15a..0f884bbdb 100644 --- a/man/LazyFrame_unnest.Rd +++ b/man/LazyFrame_unnest.Rd @@ -23,10 +23,10 @@ lf = pl$LazyFrame( b = c("one", "two", "three", "four", "five"), c = 6:10 )$ - select( - pl$col("b")$to_struct(), - pl$col("a", "c")$to_struct()$alias("a_and_c") - ) + select( + pl$col("b")$to_struct(), + pl$col("a", "c")$to_struct()$alias("a_and_c") +) lf$collect() # by default, all struct columns are unnested diff --git a/man/as_polars_series.Rd b/man/as_polars_series.Rd index b835e0ca8..2081b9eb4 100644 --- a/man/as_polars_series.Rd +++ b/man/as_polars_series.Rd @@ -21,9 +21,9 @@ Internal generic method to convert some Robj into Series pl$Series(1:5) -#warning this method makes polars very useless +# warning this method makes polars very useless as_polars_series.numeric = function(x, ...) { - head(x,3) + head(x, 3) } pl$Series(1:5) diff --git a/man/nanoarrow.Rd b/man/nanoarrow.Rd index 7af2018a2..3ecaf02a4 100644 --- a/man/nanoarrow.Rd +++ b/man/nanoarrow.Rd @@ -16,13 +16,13 @@ \alias{as_record_batch_reader.DataFrame} \title{polars to nanoarrow and arrow} \usage{ -as_nanoarrow_array_stream.DataFrame(x, ..., schema = NULL) +\method{as_nanoarrow_array_stream}{DataFrame}(x, ..., schema = NULL) -infer_nanoarrow_schema.DataFrame(x, ...) +\method{infer_nanoarrow_schema}{DataFrame}(x, ...) -as_arrow_table.DataFrame(x, ...) +\method{as_arrow_table}{DataFrame}(x, ...) -as_record_batch_reader.DataFrame(x, ..., schema = NULL) +\method{as_record_batch_reader}{DataFrame}(x, ..., schema = NULL) } \arguments{ \item{x}{a polars DataFrame} diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index 6bc9a5551..4b159916c 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -158,7 +158,7 @@ test_that("DataFrame, custom schema", { # works fine if a variable is called "schema" expect_no_error( - pl$DataFrame(list(schema = 1), schema = list(schema = pl$Float32)) + pl$DataFrame(list(schema = 1), schema = list(schema = pl$Float32)) ) # errors if incorrect datatype expect_error(pl$DataFrame(x = 1, schema = list(schema = foo))) diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 8402c786a..b5688e81b 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -774,9 +774,9 @@ test_that("unnest", { df2 = df$ select( - pl$col("a", "b", "c")$to_struct()$alias("first_struct"), - pl$col("d", "e", "f")$to_struct()$alias("second_struct") - ) + pl$col("a", "b", "c")$to_struct()$alias("first_struct"), + pl$col("d", "e", "f")$to_struct()$alias("second_struct") + ) expect_identical( df2$unnest()$collect()$to_data_frame(), @@ -787,9 +787,9 @@ test_that("unnest", { df2$unnest("first_struct")$collect()$to_data_frame(), df$ select( - pl$col("a", "b", "c"), - pl$col("d", "e", "f")$to_struct()$alias("second_struct") - )$ + pl$col("a", "b", "c"), + pl$col("d", "e", "f")$to_struct()$alias("second_struct") + )$ collect()$ to_data_frame() ) From 14fd71fd14f2ef0b1a8cd2a00327bc95a0ef9084 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Sat, 7 Oct 2023 01:53:56 +0300 Subject: [PATCH 10/14] disallow following lazy args to a first eager --- R/functions__eager.R | 33 ++++++++++++++++++++++++++------- src/rust/src/utils/mod.rs | 4 ---- tests/testthat/test-concat.R | 21 ++++++++++++++++++++- 3 files changed, 46 insertions(+), 12 deletions(-) diff --git a/R/functions__eager.R b/R/functions__eager.R index 6b4adcdd8..5344bc621 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -1,6 +1,9 @@ #' Concat polars objects #' @name pl_concat -#' @param l list of DataFrame, or Series, LazyFrame or Expr +#' @param ... individual unpacked args or args wrappend in list(). Args should either be eager +#' as DataFrame, Series and R vectors, OR lazy as LazyFrame and Expr. If first argument is lazy +#' the return is LazyFrame, otherwise DataFrame. If returning a DataFrame all args must be eager, +#' to avoid some implicit collect. #' @param rechunk perform a rechunk at last #' @param how choice of bind direction "vertical"(rbind) "horizontal"(cbind) "diagonal" diagonally #' @param parallel Boolean default TRUE, only used for LazyFrames @@ -40,9 +43,8 @@ pl$concat = function( ..., # list of DataFrames or Series or lazyFrames or expr rechunk = TRUE, - how = c("vertical", "horizontal", "diagonal"), # , "vertical_relaxed","diangonal_relaxed"), + how = c("vertical", "horizontal", "diagonal"), parallel = TRUE, - # eager = FALSE, to_supertypes = FALSE) { # unpack arg list l = unpack_list(..., skip_classes = "data.frame") @@ -63,8 +65,22 @@ pl$concat = function( eager = !inherits(first, "LazyFrame") args_modified = names(as.list(sys.call()[-1L])) - # dispatch on item class and how + # check not using any mixing of types which could lead to implicit collect + if (eager) { + for (i in seq_along(l)) { + if (inherits(l[[i]], c("LazyFrame", "Expr"))) { + .pr$RPolarsErr$new()$ + plain("tip: explitcit collect lazy inputs first, e.g. pl$concat(dataframe, lazyframe$collect())")$ + plain("LazyFrame or Expr not allowed if first arg is a DataFrame, to avoid implicit collect")$ + bad_robj(l[[i]])$ + bad_arg(paste("of those to concatenate, number", i)) |> + Err() |> + unwrap("in pl$concat()") + } + } + } + # dispatch on item class and how Result_out = pcase( how == "vertical" && (inherits(first, "Series") || is.vector(first)), { @@ -114,13 +130,16 @@ pl$concat = function( # convert back from lazy if eager and_then(Result_out, \(x) { pcase( + # run-time assertion for future changes inherits(x, "DataFrame") && !eager, Err_plain("internal logical error in pl$concat()"), + + # must collect as in rust side only lazy concat is implemented. Eager inputs are wrapped in + # lazy and then collected again. This does not mean any user input is collected. inherits(x, "LazyFrame") && eager, Ok(x$collect()), or_else = Ok(x) ) - }) |> unwrap( - "in pl$concat()" - ) + }) |> + unwrap("in pl$concat()") } diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index 8d69f6138..aa28bb6eb 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -789,10 +789,6 @@ pub fn robj_to_dataframe(robj: extendr_api::Robj) -> RResult { let extptr_df: ExternalPtr = robj.try_into()?; Ok(extptr_df.0.clone()) } - _ if robj.inherits("LazyFrame") => { - let lf: ExternalPtr = robj.try_into()?; - lf.0.clone().collect() - } _ if robj.inherits("data.frame") => { let df = unpack_r_eval(R!("polars:::result(pl$DataFrame({{robj}}))"))?; let extptr_df: ExternalPtr = df.try_into()?; diff --git a/tests/testthat/test-concat.R b/tests/testthat/test-concat.R index 8851a6fc3..90e028d89 100644 --- a/tests/testthat/test-concat.R +++ b/tests/testthat/test-concat.R @@ -1,4 +1,24 @@ test_that("concat dataframe", { + # mixing lazy with first eager not allowed + ctx = pl$concat(pl$DataFrame(mtcars), pl$LazyFrame(mtcars), how = "vertical") |> get_err_ctx() + expect_true(endsWith(ctx$BadArgument, "number 2")) + expect_true(endsWith(ctx$PlainErrorMessage, "avoid implicit collect")) + + ctx = pl$concat(pl$DataFrame(mtcars), mtcars$hp, pl$lit(mtcars$mpg), how = "horizontal") |> + get_err_ctx() + expect_true(endsWith(ctx$BadArgument, "number 3")) + expect_true(endsWith(ctx$PlainErrorMessage, "avoid implicit collect")) + + # mixing eager with first lazy is allowd + df_ref = rbind(mtcars, mtcars) + row.names(df_ref) = 1:64 + expect_identical( + pl$concat(pl$LazyFrame(mtcars), pl$DataFrame(mtcars), how = "vertical")$ + collect()$ + to_data_frame(), + df_ref + ) + # vertical dfs l_ver = lapply(1:3, function(i) { l_internal = list( @@ -8,7 +28,6 @@ test_that("concat dataframe", { pl$DataFrame(l_internal) }) - df_ver = pl$concat(l_ver, how = "vertical") expect_equal( df_ver$to_data_frame(), From dd7bedb1ba3a4ffe52b02eb286a809996e63ac8b Mon Sep 17 00:00:00 2001 From: sorhawell Date: Sat, 7 Oct 2023 01:54:32 +0300 Subject: [PATCH 11/14] roxygen --- man/DataFrame_n_chunks.Rd | 8 ++++---- man/DataFrame_rechunk.Rd | 8 ++++---- man/pl_concat.Rd | 5 ++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/man/DataFrame_n_chunks.Rd b/man/DataFrame_n_chunks.Rd index 741043e79..d0f40c3a2 100644 --- a/man/DataFrame_n_chunks.Rd +++ b/man/DataFrame_n_chunks.Rd @@ -53,11 +53,11 @@ df$n_chunks() # The following toy example emulates the Series "chunkyness" in R. Here it a S3-classed list with # same type of vectors and where have all relevant S3 generics implemented to make behave as if # it was a regular vector. -'+.chunked_vector' = \(x, y) structure(list(unlist(x) + unlist(y)),class = "chunked_vector") +"+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") print.chunked_vector = \(x, ...) print(unlist(x), ...) -c.chunked_vector = \(...) structure(do.call(c,lapply(list(...),unclass)),class="chunked_vector") -rechunk = \(x) structure(unlist(x),class = "chunked_vector") -x = structure(list(1:4, 5L),class = "chunked_vector") +c.chunked_vector = \(...) structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +rechunk = \(x) structure(unlist(x), class = "chunked_vector") +x = structure(list(1:4, 5L), class = "chunked_vector") x x + 5:1 lapply(x, tracemem) # trace chunks to verify no re-allocation diff --git a/man/DataFrame_rechunk.Rd b/man/DataFrame_rechunk.Rd index 3f8c27a3d..a62a752cf 100644 --- a/man/DataFrame_rechunk.Rd +++ b/man/DataFrame_rechunk.Rd @@ -51,11 +51,11 @@ df$n_chunks() # The following toy example emulates the Series "chunkyness" in R. Here it a S3-classed list with # same type of vectors and where have all relevant S3 generics implemented to make behave as if # it was a regular vector. -'+.chunked_vector' = \(x, y) structure(list(unlist(x) + unlist(y)),class = "chunked_vector") +"+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") print.chunked_vector = \(x, ...) print(unlist(x), ...) -c.chunked_vector = \(...) structure(do.call(c,lapply(list(...),unclass)),class="chunked_vector") -rechunk = \(x) structure(unlist(x),class = "chunked_vector") -x = structure(list(1:4, 5L),class = "chunked_vector") +c.chunked_vector = \(...) structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +rechunk = \(x) structure(unlist(x), class = "chunked_vector") +x = structure(list(1:4, 5L), class = "chunked_vector") x x + 5:1 lapply(x, tracemem) # trace chunks to verify no re-allocation diff --git a/man/pl_concat.Rd b/man/pl_concat.Rd index 732596535..44fe13485 100644 --- a/man/pl_concat.Rd +++ b/man/pl_concat.Rd @@ -4,7 +4,10 @@ \alias{pl_concat} \title{Concat polars objects} \arguments{ -\item{l}{list of DataFrame, or Series, LazyFrame or Expr} +\item{...}{individual unpacked args or args wrappend in list(). Args should either be eager +as DataFrame, Series and R vectors, OR lazy as LazyFrame and Expr. If first argument is lazy +the return is LazyFrame, otherwise DataFrame. If returning a DataFrame all args must be eager, +to avoid some implicit collect.} \item{rechunk}{perform a rechunk at last} From 90d5ec3d8e575b7d4aa30b007154972eb50039a2 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Sat, 7 Oct 2023 10:01:32 +0300 Subject: [PATCH 12/14] reduce linewidth to 100 for two examples --- R/dataframe__frame.R | 8 ++++++-- man/DataFrame_n_chunks.Rd | 4 +++- man/DataFrame_rechunk.Rd | 4 +++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 0efcd191d..0bd0e2773 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1079,7 +1079,9 @@ DataFrame_first = function() { #' # it was a regular vector. #' "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") #' print.chunked_vector = \(x, ...) print(unlist(x), ...) -#' c.chunked_vector = \(...) structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +#' c.chunked_vector = \(...) { +#' structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +#' } #' rechunk = \(x) structure(unlist(x), class = "chunked_vector") #' x = structure(list(1:4, 5L), class = "chunked_vector") #' x @@ -1143,7 +1145,9 @@ DataFrame_n_chunks = function(strategy = "all") { #' # it was a regular vector. #' "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") #' print.chunked_vector = \(x, ...) print(unlist(x), ...) -#' c.chunked_vector = \(...) structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +#' c.chunked_vector = \(...) { +#' structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +#' } #' rechunk = \(x) structure(unlist(x), class = "chunked_vector") #' x = structure(list(1:4, 5L), class = "chunked_vector") #' x diff --git a/man/DataFrame_n_chunks.Rd b/man/DataFrame_n_chunks.Rd index d0f40c3a2..7c5ddd526 100644 --- a/man/DataFrame_n_chunks.Rd +++ b/man/DataFrame_n_chunks.Rd @@ -55,7 +55,9 @@ df$n_chunks() # it was a regular vector. "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") print.chunked_vector = \(x, ...) print(unlist(x), ...) -c.chunked_vector = \(...) structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +c.chunked_vector = \(...) { + structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +} rechunk = \(x) structure(unlist(x), class = "chunked_vector") x = structure(list(1:4, 5L), class = "chunked_vector") x diff --git a/man/DataFrame_rechunk.Rd b/man/DataFrame_rechunk.Rd index a62a752cf..592cc15aa 100644 --- a/man/DataFrame_rechunk.Rd +++ b/man/DataFrame_rechunk.Rd @@ -53,7 +53,9 @@ df$n_chunks() # it was a regular vector. "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") print.chunked_vector = \(x, ...) print(unlist(x), ...) -c.chunked_vector = \(...) structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +c.chunked_vector = \(...) { + structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +} rechunk = \(x) structure(unlist(x), class = "chunked_vector") x = structure(list(1:4, 5L), class = "chunked_vector") x From c8a2abeac99c89a0ba9d9240b91adaa4a7da3b65 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Sun, 8 Oct 2023 14:59:51 +0300 Subject: [PATCH 13/14] move rs import statement + news --- NEWS.md | 1 + src/rust/src/rdataframe/mod.rs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 84ba3446a..455509114 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,7 @@ ## What's changed +- `pl$concat()` now also supports `Series`, `Expr` and `LazyFrame` (#407). - New method `$unnest()` for `LazyFrame` (#397). - New method `$sample()` for `DataFrame` (#399). - New method `$meta$tree_format()` to display an `Expr` as a tree (#401). diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index 2e4c77433..4bccc8a87 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -9,7 +9,7 @@ use crate::lazy; use crate::rdatatype; use crate::rdatatype::RPolarsDataType; use crate::robj_to; -use crate::rpolarserr::{polars_to_rpolars_err, RResult}; +use crate::rpolarserr::*; pub use lazy::dataframe::*; @@ -77,7 +77,7 @@ impl From for DataFrame { DataFrame(item) } } -use crate::rpolarserr::*; + #[extendr] impl DataFrame { pub fn shape(&self) -> Robj { From a53620e3f9dc61cc696197cb002d66d371ec6cdc Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Mon, 9 Oct 2023 09:29:13 +0200 Subject: [PATCH 14/14] improve docs --- R/dataframe__frame.R | 104 ++++++++++---------------------------- R/functions__eager.R | 38 +++++++++----- man/DataFrame_n_chunks.Rd | 43 ++++++++-------- man/DataFrame_rechunk.Rd | 40 +++++++-------- man/pl_concat.Rd | 36 +++++++++---- 5 files changed, 120 insertions(+), 141 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 0bd0e2773..18f970061 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1033,33 +1033,35 @@ DataFrame_first = function() { } -#' @title Number of chunks of the Series' in a DataFrame -#' @description Number of chunks (mem allocations) for all or first Series' in a DataFrame. +#' @title Number of chunks of the Series in a DataFrame +#' @description +#' Number of chunks (memory allocations) for all or first Series in a DataFrame. #' @details -#' DataFrame is a vector of Series'. Each Series in rust-polars is a wrapper around a ChunkedArray, -#' which is like a virtual contiguous vector, which is physically backed by an ordered set of -#' chunks. Each chunk of values has a contiguous memory layout and is an arrow array. Arrow arrays -#' are a fast, thread-safe and cross-platform memory layout. +#' A DataFrame is a vector of Series. Each Series in rust-polars is a wrapper +#' around a ChunkedArray, which is like a virtual contiguous vector physically +#' backed by an ordered set of chunks. Each chunk of values has a contiguous +#' memory layout and is an arrow array. Arrow arrays are a fast, thread-safe and +#' cross-platform memory layout. #' -#' In R when combining with `c()` or `rbind()`, it requires immediate vector re-allocation to place -#' vector values in contiguous memory, hence repeated appending to a vector in R is discouraged -#' because slow. +#' In R, combining with `c()` or `rbind()` requires immediate vector re-allocation +#' to place vector values in contiguous memory. This is slow and memory consuming, +#' and it is why repeatedly appending to a vector in R is discouraged. #' -#' When concatenating/appending to Series' or DataFrame' re-allocation can be avoided or delayed, -#' by simply appending chunks to each individual Series. -#' -#' However, if chunks become many and small or are misaligned across Series, this can hurt the -#' performance of following operations. +#' In polars, when we concatenate or append to Series or DataFrame, the +#' re-allocation can be avoided or delayed by simply appending chunks to each +#' individual Series. However, if chunks become many and small or are misaligned +#' across Series, this can hurt the performance of subsequent operations. #' #' Most places in the polars api where chunking could occur, the user have to -#' typically actively opt-out by setting and arg `rechunk = FALSE`. Thus it is not normal +#' typically actively opt-out by setting an argument `rechunk = FALSE`. #' #' @keywords DataFrame -#' @param strategy string either 'all' or 'first'. 'first' only returns chunks for first Series. -#' @return real vector of chunk counts per Series. +#' @param strategy Either `"all"` or `"first"`. `"first"` only returns chunks +#' for the first Series. +#' @return A real vector of chunk counts per Series. #' @seealso [`$rechunk()`][DataFrame_rechunk] #' @examples -#' # create DataFrame with miss aligned chunks +#' # create DataFrame with misaligned chunks #' df = pl$concat( #' 1:10, # single chunk #' pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks @@ -1074,9 +1076,9 @@ DataFrame_first = function() { #' # rechunk is not an in-place operation #' df$n_chunks() #' -#' # The following toy example emulates the Series "chunkyness" in R. Here it a S3-classed list with -#' # same type of vectors and where have all relevant S3 generics implemented to make behave as if -#' # it was a regular vector. +#' # The following toy example emulates the Series "chunkyness" in R. Here it a +#' # S3-classed list with same type of vectors and where have all relevant S3 +#' # generics implemented to make behave as if it was a regular vector. #' "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") #' print.chunked_vector = \(x, ...) print(unlist(x), ...) #' c.chunked_vector = \(...) { @@ -1100,65 +1102,13 @@ DataFrame_n_chunks = function(strategy = "all") { #' @title Rechunk a DataFrame -#' @description Rechunking re-allocates any "chunked" memory allocations to speed-up e.g. vectorized -#' operations. -#' @details -#' DataFrame is a vector of Series'. Each Series in rust-polars is a wrapper around a ChunkedArray, -#' which is like a virtual contiguous vector, which is physically backed by an ordered set of -#' chunks. Each chunk of values has a contiguous memory layout and is an arrow array. Arrow arrays -#' are a fast, thread-safe and cross-platform memory layout. -#' -#' In R when combining with `c()` or `rbind()`, it requires immediate vector re-allocation to place -#' vector values in contiguous memory, hence repeated appending to a vector in R is discouraged -#' because slow. -#' -#' When concatenating/appending to Series' or DataFrame' re-allocation can be avoided or delayed, -#' by simply appending chunks to each individual Series. -#' -#' However, if chunks become many and small or are misaligned across Series, this can hurt the -#' performance of following operations. -#' -#' Most places in the polars api where chunking could occur, the user have to -#' typically actively opt-out by setting and arg `rechunk = FALSE`. Thus it is not normal +#' @description Rechunking re-allocates any "chunked" memory allocations to +#' speed-up e.g. vectorized operations. +#' @inherit DataFrame_n_chunks details examples #' #' @keywords DataFrame -#' @return real vector of chunk counts per Series. +#' @return A DataFrame #' @seealso [`$n_chunks()`][DataFrame_n_chunks] -#' @examples -#' # create DataFrame with miss alligned chunks -#' df = pl$concat( -#' 1:10, # single chunk -#' pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks -#' how = "horizontal" -#' ) -#' df -#' df$n_chunks() -#' -#' # rechunk a chunked DataFrame -#' df$rechunk()$n_chunks() -#' -#' # rechunk is not an in-place operation -#' df$n_chunks() -#' -#' # The following toy example emulates the Series "chunkyness" in R. Here it a S3-classed list with -#' # same type of vectors and where have all relevant S3 generics implemented to make behave as if -#' # it was a regular vector. -#' "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") -#' print.chunked_vector = \(x, ...) print(unlist(x), ...) -#' c.chunked_vector = \(...) { -#' structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") -#' } -#' rechunk = \(x) structure(unlist(x), class = "chunked_vector") -#' x = structure(list(1:4, 5L), class = "chunked_vector") -#' x -#' x + 5:1 -#' lapply(x, tracemem) # trace chunks to verify no re-allocation -#' z = c(x, x) -#' z # looks like a plain vector -#' lapply(z, tracemem) # mem allocation in z are the same from x -#' str(z) -#' z = rechunk(z) -#' str(z) DataFrame_rechunk = function() { .pr$DataFrame$rechunk(self) } diff --git a/R/functions__eager.R b/R/functions__eager.R index 5344bc621..d925d8582 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -1,17 +1,24 @@ #' Concat polars objects #' @name pl_concat -#' @param ... individual unpacked args or args wrappend in list(). Args should either be eager -#' as DataFrame, Series and R vectors, OR lazy as LazyFrame and Expr. If first argument is lazy -#' the return is LazyFrame, otherwise DataFrame. If returning a DataFrame all args must be eager, -#' to avoid some implicit collect. -#' @param rechunk perform a rechunk at last -#' @param how choice of bind direction "vertical"(rbind) "horizontal"(cbind) "diagonal" diagonally -#' @param parallel Boolean default TRUE, only used for LazyFrames -#' @param to_supertypes Boolean default TRUE, cast columns shared super types, if any. +#' @param ... Either individual unpacked args or args wrapped in list(). Args can +#' be eager as DataFrame, Series and R vectors, or lazy as LazyFrame and Expr. +#' The first element determines the output of `$concat()`: if the first element +#' is lazy, a LazyFrame is returned; otherwise, a DataFrame is returned (note +#' that if the first element is eager, all other elements have to be eager to +#' avoid implicit collect). +#' @param rechunk Perform a rechunk at last. +#' @param how Bind direction. Can be "vertical" (like `rbind()`), "horizontal" +#' (like `cbind()`), or "diagonal". +#' @param parallel Only used for LazyFrames. If `TRUE` (default), lazy +#' computations may be executed in parallel. +#' @param to_supertypes If `TRUE` (default), cast columns shared super types, if +#' any. For example, if we try to vertically concatenate two columns of types `i32` +#' and `f64`, the column of type `i32` will be cast to `f64` beforehand. This +#' argument is equivalent to the "_relaxed" operations in Python polars. #' #' @details -#' Categorical columns/Series must have been constructed while global string cache enabled -#' [`pl$enable_string_cache()`][pl_enable_string_cache] +#' Categorical columns/Series must have been constructed while global string +#' cache enabled. See [`pl$enable_string_cache()`][pl_enable_string_cache]. #' #' #' @return DataFrame, or Series, LazyFrame or Expr @@ -27,7 +34,6 @@ #' }) #' pl$concat(l_ver, how = "vertical") #' -#' #' # horizontal #' l_hor = lapply(1:10, function(i) { #' l_internal = list( @@ -38,8 +44,16 @@ #' pl$DataFrame(l_internal) #' }) #' pl$concat(l_hor, how = "horizontal") +#' #' # diagonal #' pl$concat(l_hor, how = "diagonal") +#' +#' # if two columns don't share the same type, concat() will error unless we use +#' # `to_supertypes = TRUE`: +#' test = pl$DataFrame(x = 1L) # i32 +#' test2 = pl$DataFrame(x = 1.0) #f64 +#' +#' pl$concat(test, test2, to_supertypes = TRUE) pl$concat = function( ..., # list of DataFrames or Series or lazyFrames or expr rechunk = TRUE, @@ -70,7 +84,7 @@ pl$concat = function( for (i in seq_along(l)) { if (inherits(l[[i]], c("LazyFrame", "Expr"))) { .pr$RPolarsErr$new()$ - plain("tip: explitcit collect lazy inputs first, e.g. pl$concat(dataframe, lazyframe$collect())")$ + plain("tip: explicitly collect lazy inputs first, e.g. pl$concat(dataframe, lazyframe$collect())")$ plain("LazyFrame or Expr not allowed if first arg is a DataFrame, to avoid implicit collect")$ bad_robj(l[[i]])$ bad_arg(paste("of those to concatenate, number", i)) |> diff --git a/man/DataFrame_n_chunks.Rd b/man/DataFrame_n_chunks.Rd index 7c5ddd526..4fe2e50cc 100644 --- a/man/DataFrame_n_chunks.Rd +++ b/man/DataFrame_n_chunks.Rd @@ -2,40 +2,41 @@ % Please edit documentation in R/dataframe__frame.R \name{DataFrame_n_chunks} \alias{DataFrame_n_chunks} -\title{Number of chunks of the Series' in a DataFrame} +\title{Number of chunks of the Series in a DataFrame} \usage{ DataFrame_n_chunks(strategy = "all") } \arguments{ -\item{strategy}{string either 'all' or 'first'. 'first' only returns chunks for first Series.} +\item{strategy}{Either \code{"all"} or \code{"first"}. \code{"first"} only returns chunks +for the first Series.} } \value{ -real vector of chunk counts per Series. +A real vector of chunk counts per Series. } \description{ -Number of chunks (mem allocations) for all or first Series' in a DataFrame. +Number of chunks (memory allocations) for all or first Series in a DataFrame. } \details{ -DataFrame is a vector of Series'. Each Series in rust-polars is a wrapper around a ChunkedArray, -which is like a virtual contiguous vector, which is physically backed by an ordered set of -chunks. Each chunk of values has a contiguous memory layout and is an arrow array. Arrow arrays -are a fast, thread-safe and cross-platform memory layout. +A DataFrame is a vector of Series. Each Series in rust-polars is a wrapper +around a ChunkedArray, which is like a virtual contiguous vector physically +backed by an ordered set of chunks. Each chunk of values has a contiguous +memory layout and is an arrow array. Arrow arrays are a fast, thread-safe and +cross-platform memory layout. -In R when combining with \code{c()} or \code{rbind()}, it requires immediate vector re-allocation to place -vector values in contiguous memory, hence repeated appending to a vector in R is discouraged -because slow. +In R, combining with \code{c()} or \code{rbind()} requires immediate vector re-allocation +to place vector values in contiguous memory. This is slow and memory consuming, +and it is why repeatedly appending to a vector in R is discouraged. -When concatenating/appending to Series' or DataFrame' re-allocation can be avoided or delayed, -by simply appending chunks to each individual Series. - -However, if chunks become many and small or are misaligned across Series, this can hurt the -performance of following operations. +In polars, when we concatenate or append to Series or DataFrame, the +re-allocation can be avoided or delayed by simply appending chunks to each +individual Series. However, if chunks become many and small or are misaligned +across Series, this can hurt the performance of subsequent operations. Most places in the polars api where chunking could occur, the user have to -typically actively opt-out by setting and arg \code{rechunk = FALSE}. Thus it is not normal +typically actively opt-out by setting an argument \code{rechunk = FALSE}. } \examples{ -# create DataFrame with miss aligned chunks +# create DataFrame with misaligned chunks df = pl$concat( 1:10, # single chunk pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks @@ -50,9 +51,9 @@ df$rechunk()$n_chunks() # rechunk is not an in-place operation df$n_chunks() -# The following toy example emulates the Series "chunkyness" in R. Here it a S3-classed list with -# same type of vectors and where have all relevant S3 generics implemented to make behave as if -# it was a regular vector. +# The following toy example emulates the Series "chunkyness" in R. Here it a +# S3-classed list with same type of vectors and where have all relevant S3 +# generics implemented to make behave as if it was a regular vector. "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") print.chunked_vector = \(x, ...) print(unlist(x), ...) c.chunked_vector = \(...) { diff --git a/man/DataFrame_rechunk.Rd b/man/DataFrame_rechunk.Rd index 592cc15aa..25606064c 100644 --- a/man/DataFrame_rechunk.Rd +++ b/man/DataFrame_rechunk.Rd @@ -7,33 +7,33 @@ DataFrame_rechunk() } \value{ -real vector of chunk counts per Series. +A DataFrame } \description{ -Rechunking re-allocates any "chunked" memory allocations to speed-up e.g. vectorized -operations. +Rechunking re-allocates any "chunked" memory allocations to +speed-up e.g. vectorized operations. } \details{ -DataFrame is a vector of Series'. Each Series in rust-polars is a wrapper around a ChunkedArray, -which is like a virtual contiguous vector, which is physically backed by an ordered set of -chunks. Each chunk of values has a contiguous memory layout and is an arrow array. Arrow arrays -are a fast, thread-safe and cross-platform memory layout. +A DataFrame is a vector of Series. Each Series in rust-polars is a wrapper +around a ChunkedArray, which is like a virtual contiguous vector physically +backed by an ordered set of chunks. Each chunk of values has a contiguous +memory layout and is an arrow array. Arrow arrays are a fast, thread-safe and +cross-platform memory layout. -In R when combining with \code{c()} or \code{rbind()}, it requires immediate vector re-allocation to place -vector values in contiguous memory, hence repeated appending to a vector in R is discouraged -because slow. +In R, combining with \code{c()} or \code{rbind()} requires immediate vector re-allocation +to place vector values in contiguous memory. This is slow and memory consuming, +and it is why repeatedly appending to a vector in R is discouraged. -When concatenating/appending to Series' or DataFrame' re-allocation can be avoided or delayed, -by simply appending chunks to each individual Series. - -However, if chunks become many and small or are misaligned across Series, this can hurt the -performance of following operations. +In polars, when we concatenate or append to Series or DataFrame, the +re-allocation can be avoided or delayed by simply appending chunks to each +individual Series. However, if chunks become many and small or are misaligned +across Series, this can hurt the performance of subsequent operations. Most places in the polars api where chunking could occur, the user have to -typically actively opt-out by setting and arg \code{rechunk = FALSE}. Thus it is not normal +typically actively opt-out by setting an argument \code{rechunk = FALSE}. } \examples{ -# create DataFrame with miss alligned chunks +# create DataFrame with misaligned chunks df = pl$concat( 1:10, # single chunk pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks @@ -48,9 +48,9 @@ df$rechunk()$n_chunks() # rechunk is not an in-place operation df$n_chunks() -# The following toy example emulates the Series "chunkyness" in R. Here it a S3-classed list with -# same type of vectors and where have all relevant S3 generics implemented to make behave as if -# it was a regular vector. +# The following toy example emulates the Series "chunkyness" in R. Here it a +# S3-classed list with same type of vectors and where have all relevant S3 +# generics implemented to make behave as if it was a regular vector. "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") print.chunked_vector = \(x, ...) print(unlist(x), ...) c.chunked_vector = \(...) { diff --git a/man/pl_concat.Rd b/man/pl_concat.Rd index 44fe13485..227bebb70 100644 --- a/man/pl_concat.Rd +++ b/man/pl_concat.Rd @@ -4,18 +4,25 @@ \alias{pl_concat} \title{Concat polars objects} \arguments{ -\item{...}{individual unpacked args or args wrappend in list(). Args should either be eager -as DataFrame, Series and R vectors, OR lazy as LazyFrame and Expr. If first argument is lazy -the return is LazyFrame, otherwise DataFrame. If returning a DataFrame all args must be eager, -to avoid some implicit collect.} +\item{...}{Either individual unpacked args or args wrapped in list(). Args can +be eager as DataFrame, Series and R vectors, or lazy as LazyFrame and Expr. +The first element determines the output of \verb{$concat()}: if the first element +is lazy, a LazyFrame is returned; otherwise, a DataFrame is returned (note +that if the first element is eager, all other elements have to be eager to +avoid implicit collect).} -\item{rechunk}{perform a rechunk at last} +\item{rechunk}{Perform a rechunk at last.} -\item{how}{choice of bind direction "vertical"(rbind) "horizontal"(cbind) "diagonal" diagonally} +\item{how}{Bind direction. Can be "vertical" (like \code{rbind()}), "horizontal" +(like \code{cbind()}), or "diagonal".} -\item{parallel}{Boolean default TRUE, only used for LazyFrames} +\item{parallel}{Only used for LazyFrames. If \code{TRUE} (default), lazy +computations may be executed in parallel.} -\item{to_supertypes}{Boolean default TRUE, cast columns shared super types, if any.} +\item{to_supertypes}{If \code{TRUE} (default), cast columns shared super types, if +any. For example, if we try to vertically concatenate two columns of types \code{i32} +and \code{f64}, the column of type \code{i32} will be cast to \code{f64} beforehand. This +argument is equivalent to the "_relaxed" operations in Python polars.} } \value{ DataFrame, or Series, LazyFrame or Expr @@ -24,8 +31,8 @@ DataFrame, or Series, LazyFrame or Expr Concat polars objects } \details{ -Categorical columns/Series must have been constructed while global string cache enabled -\code{\link[=pl_enable_string_cache]{pl$enable_string_cache()}} +Categorical columns/Series must have been constructed while global string +cache enabled. See \code{\link[=pl_enable_string_cache]{pl$enable_string_cache()}}. } \examples{ # vertical @@ -38,7 +45,6 @@ l_ver = lapply(1:10, function(i) { }) pl$concat(l_ver, how = "vertical") - # horizontal l_hor = lapply(1:10, function(i) { l_internal = list( @@ -49,6 +55,14 @@ l_hor = lapply(1:10, function(i) { pl$DataFrame(l_internal) }) pl$concat(l_hor, how = "horizontal") + # diagonal pl$concat(l_hor, how = "diagonal") + +# if two columns don't share the same type, concat() will error unless we use +# `to_supertypes = TRUE`: +test = pl$DataFrame(x = 1L) # i32 +test2 = pl$DataFrame(x = 1.0) #f64 + +pl$concat(test, test2, to_supertypes = TRUE) }