Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for pl$concat(<LazyFrame>, . . . ) + add to_supertypes auto casting #407

Merged
merged 18 commits into from
Oct 9, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions R/dataframe__frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -1057,6 +1057,25 @@ DataFrame_first = function() {
self$lazy()$first()$collect()
}


#' @title Get the number of chunks of each Series in a DataFrame
etiennebacher marked this conversation as resolved.
Show resolved Hide resolved
#' @keywords DataFrame
#' @param strategy string either 'all' or 'first'. 'first' only returns chunks for first Series.
#' @return real vector of chunk counts per Series.
#' @examples
#' df = pl$concat(
#' 1:10,
#' pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"),
#' how = "horizontal"
#' )
#' df
#' df$n_chunks()
DataFrame_n_chunks = function(strategy = "all") {
  # Ask the native DataFrame method for the chunk counts; `strategy` is passed
  # through untouched, so any validation of it happens on the native side.
  chunk_counts = .pr$DataFrame$n_chunks(self, strategy)

  # The native call's return value is handed to unwrap() together with the
  # method name as context.
  unwrap(chunk_counts, "in n_chunks():")
}


#' @title Get the last row of the DataFrame.
#' @keywords DataFrame
#' @return A DataFrame with one row.
Expand Down
13 changes: 10 additions & 3 deletions R/error__rpolarserr.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,19 @@ bad_robj = function(r) {
.pr$RPolarsErr$new()$bad_robj(r)
}

Err_plain = function(x) {
Err(.pr$RPolarsErr$new()$plain(x))
Err_plain = function(...) {
  # Build the message from all fragments. For scalar fragments paste() joins
  # them with its default separator, a single space.
  msg = paste(..., collapse = " ")

  # Wrap the message in a plain RPolarsErr and return it as an Err value.
  Err(.pr$RPolarsErr$new()$plain(msg))
}

# short hand for extracting an error context in unit testing, will raise error if not an RPolarsErr
get_err_ctx = \(x) unwrap_err(result(x))$contexts()
get_err_ctx = function(x, select = NULL) {
  # Evaluate `x` as a result, take its error and list the error's contexts.
  contexts = unwrap_err(result(x))$contexts()

  # Without a selection the complete context list is returned.
  if (is.null(select)) {
    return(contexts)
  }

  # Otherwise pick one context by name; match.arg() allows an unambiguous
  # prefix and errors on an unknown name.
  contexts[[match.arg(select, names(contexts))]]
}


# wrapper to return Result
Expand Down
3 changes: 1 addition & 2 deletions R/expr__meta.R
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,8 @@ ExprMeta_is_regex_projection = function() {
#' @examples
#' my_expr = (pl$col("foo") * pl$col("bar"))$sum()$over(pl$col("ham")) / 2
#' my_expr$meta$tree_format()

ExprMeta_tree_format = function(return_as_string = FALSE) {
out <- .pr$Expr$meta_tree_format(self) |>
out = .pr$Expr$meta_tree_format(self) |>
unwrap("in $tree_format():")
if (isTRUE(return_as_string)) {
out
Expand Down
28 changes: 16 additions & 12 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,6 @@
#' @useDynLib polars, .registration = TRUE
NULL

rlazy_csv_reader <- function(path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) .Call(wrap__rlazy_csv_reader, path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates)

import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_count, memmap) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_count, memmap)

new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory)

concat_df <- function(vdf) .Call(wrap__concat_df, vdf)

hor_concat_df <- function(dfs) .Call(wrap__hor_concat_df, dfs)

diag_concat_df <- function(dfs) .Call(wrap__diag_concat_df, dfs)

min_exprs <- function(exprs) .Call(wrap__min_exprs, exprs)

max_exprs <- function(exprs) .Call(wrap__max_exprs, exprs)
Expand Down Expand Up @@ -75,6 +63,20 @@ test_wrong_call_pl_lit <- function(robj) .Call(wrap__test_wrong_call_pl_lit, rob

polars_features <- function() .Call(wrap__polars_features)

# Concatenation entry points. Each wrapper only forwards its arguments, in the
# same order, to the native routine of the matching name via .Call().
# NOTE(review): this looks like an auto-generated wrapper file; hand-written
# comments may be lost when it is regenerated — confirm before relying on them.

# Forwards l, rechunk, parallel and to_supertypes to wrap__concat_lf.
concat_lf <- function(l, rechunk, parallel, to_supertypes) .Call(wrap__concat_lf, l, rechunk, parallel, to_supertypes)

# Forwards l, rechunk and parallel to wrap__diag_concat_lf.
diag_concat_lf <- function(l, rechunk, parallel) .Call(wrap__diag_concat_lf, l, rechunk, parallel)

# Forwards l to wrap__hor_concat_df; takes no further options.
hor_concat_df <- function(l) .Call(wrap__hor_concat_df, l)

# Forwards l, rechunk and to_supertypes to wrap__concat_series.
concat_series <- function(l, rechunk, to_supertypes) .Call(wrap__concat_series, l, rechunk, to_supertypes)

rlazy_csv_reader <- function(path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) .Call(wrap__rlazy_csv_reader, path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates)

import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_count, memmap) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_count, memmap)

new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory)

test_rpolarserr <- function() .Call(wrap__test_rpolarserr)

setup_renv <- function() .Call(wrap__setup_renv)
Expand Down Expand Up @@ -111,6 +113,8 @@ DataFrame <- new.env(parent = emptyenv())

DataFrame$shape <- function() .Call(wrap__DataFrame__shape, self)

# Forwards `self` and `strategy` to wrap__DataFrame__n_chunks. `self` is not a
# formal argument here, so it has to be resolved from the enclosing/calling
# environment (the binding is not visible in this part of the file).
DataFrame$n_chunks <- function(strategy) .Call(wrap__DataFrame__n_chunks, self, strategy)

DataFrame$clone_see_me_macro <- function() .Call(wrap__DataFrame__clone_see_me_macro, self)

DataFrame$default <- function() .Call(wrap__DataFrame__default)
Expand Down
95 changes: 71 additions & 24 deletions R/functions__eager.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
#' @param ... Either a single list of DataFrame, Series, LazyFrame or Expr, or those objects passed as separate arguments
etiennebacher marked this conversation as resolved.
Show resolved Hide resolved
#' @param rechunk perform a rechunk at last
#' @param how choice of bind direction "vertical"(rbind) "horizontal"(cbind) "diagonal" diagonally
#' @param parallel BOOL default TRUE, only used for LazyFrames
#' @param parallel Boolean default TRUE, only used for LazyFrames
#' @param to_supertypes Boolean default FALSE, cast columns to their shared supertypes, if any.
#'
#' @details
#' Categorical columns/Series must have been constructed while global string cache enabled
Expand Down Expand Up @@ -37,41 +38,88 @@
#' # diagonal
#' pl$concat(l_hor, how = "diagonal")
pl$concat = function(
    ..., # DataFrames, Series, LazyFrames or R vectors; or one list of them
    rechunk = TRUE,
    how = c("vertical", "horizontal", "diagonal"), # , "vertical_relaxed", "diagonal_relaxed"),
    parallel = TRUE,
    to_supertypes = FALSE) {
  # Accept both pl$concat(a, b, c) and pl$concat(list(a, b, c)). A lone
  # data.frame is a list as well, so it is explicitly kept from being unpacked.
  l = unpack_list(..., skip_classes = "data.frame")

  ## Check inputs
  how_args = c("vertical", "horizontal", "diagonal") # , "vertical_relaxed", "diagonal_relaxed")

  # match.arg() is routed through result()/unwrap() so that an invalid `how`
  # is reported with the pl$concat() context (assumes result() captures the
  # error raised by its argument — defined elsewhere in the package).
  how = match.arg(how[1L], how_args) |>
    result() |>
    unwrap("in pl$concat()")

  # With nothing to concatenate there is no first element to dispatch on;
  # fail with an explicit message instead of "subscript out of bounds".
  if (length(l) == 0L) {
    return(
      Err_plain("no objects to concatenate, at least one is required") |>
        unwrap("in pl$concat()")
    )
  }

  # The first element decides the mode: a LazyFrame keeps the result lazy,
  # anything else is returned eagerly.
  first = l[[1L]]
  eager = !inherits(first, "LazyFrame")

  # Names of the arguments the caller spelled out in the call; used to warn
  # only when an argument that has no effect was explicitly supplied.
  args_modified = names(as.list(sys.call()[-1L]))

  # dispatch on item class and how
  Result_out = pcase(
    # Series (or plain R vectors) stacked vertically
    how == "vertical" && (inherits(first, "Series") || is.vector(first)),
    {
      if ("parallel" %in% args_modified) {
        warning(
          "in pl$concat(): args: parallel takes no effect when concatenating Series",
          call. = FALSE
        )
      }
      concat_series(l, rechunk, to_supertypes)
    },

    # any other vertical concatenation
    how == "vertical",
    concat_lf(l, rechunk, parallel, to_supertypes),

    how == "diagonal",
    {
      if ("to_supertypes" %in% args_modified) {
        # single string: warning() pastes its fragments without a separator
        warning(
          "Args to_supertypes takes no effect for how=='diagonal'",
          call. = FALSE
        )
      }
      diag_concat_lf(l, rechunk, parallel)
    },

    # horizontal concatenation has no lazy implementation
    how == "horizontal" && !eager,
    {
      Err_plain(
        "how=='horizontal' is not supported for lazy (first element is LazyFrame).",
        "Try e.g. <LazyFrame>$join() to get Lazy join or pl$concat(lf1$collect(),lf2,lf3)",
        "to get an eager horizontal concatenation."
      )
    },

    how == "horizontal",
    {
      if (any(args_modified %in% c("parallel", "to_supertypes"))) {
        warning(
          "Args parallel, rechunk and to_supertypes take no effect for how=='horizontal'",
          call. = FALSE
        )
      }
      hor_concat_df(l)
    },

    # TODO implement Series, Expr, Lazy etc
    or_else = Err_plain("internal error:", how, "not handled")
  )

  # convert back from lazy if eager: a LazyFrame result is collected when the
  # caller's first element was not a LazyFrame; a DataFrame result for lazy
  # input is reported as an internal error.
  and_then(Result_out, \(x) {
    pcase(
      inherits(x, "DataFrame") && !eager, Err_plain("internal logical error in pl$concat()"),
      inherits(x, "LazyFrame") && eager, Ok(x$collect()),
      or_else = Ok(x)
    )
  }) |> unwrap(
    "in pl$concat()"
  )
}


Expand Down Expand Up @@ -136,8 +184,7 @@ pl$date_range = function(
name = NULL, # : str | None = None,
time_unit = "us",
time_zone = NULL, # : str | None = None
explode = TRUE
) {
explode = TRUE) {
if (missing(end)) {
end = start
interval = "1h"
Expand Down
11 changes: 5 additions & 6 deletions R/lazyframe__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -1319,21 +1319,20 @@ LazyFrame_clone = function() {
#' b = c("one", "two", "three", "four", "five"),
#' c = 6:10
#' )$
#' select(
#' pl$col("b")$to_struct(),
#' pl$col("a", "c")$to_struct()$alias("a_and_c")
#' )
#' select(
#' pl$col("b")$to_struct(),
#' pl$col("a", "c")$to_struct()$alias("a_and_c")
#' )
#' lf$collect()
#'
#' # by default, all struct columns are unnested
#' lf$unnest()$collect()
#'
#' # we can specify specific columns to unnest
#' lf$unnest("a_and_c")$collect()

LazyFrame_unnest = function(names = NULL) {
  if (is.null(names)) {
    # Default: unnest every struct column. The schema lookup is unwrapped so
    # that a failure is raised with context instead of being silently read as
    # an empty `$ok` (assumes the schema call returns a Result, as the original
    # `$ok` access implies).
    schema = unwrap(.pr$LazyFrame$schema(self), "in $unnest():")
    names = names(which(dtypes_are_struct(schema)))
  }
  unwrap(.pr$LazyFrame$unnest(self, names), "in $unnest():")
}
2 changes: 1 addition & 1 deletion R/series__series.R
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,7 @@ Series_dtype = method_as_property(function() {
#' @keywords Series
#' @return DataType
#' @aliases Series_flags
#' @name Series_dtype
#' @name Series_flags
#' @details property sorted flags are not settable, use set_sorted
#' @examples
#' pl$Series(1:4)$sort()$flags
Expand Down
9 changes: 7 additions & 2 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ list2 = list
#' Internal unpack list
#' @noRd
#' @param l any list
#' @param skip_classes character vector or NULL; a single list argument that inherits from any of these classes is not unpacked.
#' @details py-polars syntax only allows e.g. `df.select([expr1, expr2,])` and not
#' `df.select(expr1, expr2,)`. r-polars also allows user to directly write
#' `df$select(expr1, expr2)` or `df$select(list(expr1,expr2))`. Unpack list
Expand All @@ -103,9 +104,13 @@ list2 = list
#' f = \(...) unpack_list(list(...))
#' identical(f(list(1L, 2L, 3L)), f(1L, 2L, 3L)) # is TRUE
#' identical(f(list(1L, 2L), 3L), f(1L, 2L, 3L)) # is FALSE
unpack_list = function(...) {
unpack_list = function(..., skip_classes = NULL) {
l = list2(...)
if (length(l) == 1L && is.list(l[[1L]])) {
if (
length(l) == 1L &&
is.list(l[[1L]]) &&
!(!is.null(skip_classes) && inherits(l[[1L]], skip_classes))
) {
l[[1L]]
} else {
l
Expand Down
27 changes: 27 additions & 0 deletions man/DataFrame_n_chunks.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 0 additions & 11 deletions man/Series_dtype.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions man/Series_flags.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions man/nanoarrow.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading