Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New `with` optimization to avoid `[` overhead #4488

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,33 @@ replace_dot_alias = function(e) {
on.exit(options(oldverbose))
}
.global$print=""
ColeMiller1 marked this conversation as resolved.
Show resolved Hide resolved
# with2 #4485
if (!missing(with) && (length(with)>1L || !is.null(names(with)))) {
w = wither(with)
# this checks for valid negation usage in 'j' should be removed after some time, it is here to ensure newcomers to this feature will use it correctly
if (isFALSE(w[["j"]]) && !identical(as.integer(getOption("datatable.with",1L)), 2L) && hasNot(subj<-substitute(j)) &&
!with_j_valid(subj, env=parent.frame(), verbose=verbose)) stop("internal error: with_j_valid should have raised an error already") # nocov #w$j = NA
nr = nrow(x)
nc = length(x)
if (isFALSE(w[["i"]]) && !missing(i)) i = with_i(i, len=nr, verbose=verbose)
if (isFALSE(w[["j"]]) && !missing(j)) j = with_j(j, len=nc, x=x, verbose=verbose)
if ((isFALSE(w[["i"]]) && missing(j)) || (isFALSE(w[["j"]]) && missing(i)) || (isFALSE(w[["i"]]) && isFALSE(w[["j"]]))) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a bit hard to figure out what this branch is for

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the body of this branch one can see that it is an early return that skips the rest of `[` processing.

if (missing(i)) i = seq_len(nr)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seq_len(nr) is only needed on the which logic branch. Otherwise, we could do i = NULL

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but eventually subsetDT could skip the copy when NULL is provided, although it does copy now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main point being that if missing(i), this would realize an integer vector seq_len(nr) when it is unnecessary except for the which() branch.

Regarding allowing shallow copies in CsubsetDT, that would be a nice feature. A data.frame does not appear to make a copy when selecting columns. At least that's what memory profiling using bench::mark suggests.

if (missing(j)) j = seq_len(nc)
if (verbose)
cat("with=FALSE short-circuit return\n")
if (!isFALSE(which)) {
return(if (isTRUE(which)) i else {
ii = seq_len(nr)
ii[!ii%in%i]
})
} else {
return(.Call(CsubsetDT, x, i, j))
}
} else {
with = w[["j"]]
}
}
missingby = missing(by) && missing(keyby) # for tests 359 & 590 where passing by=NULL results in data.table not vector
if (!missing(keyby)) {
if (!missing(by)) stop("Provide either by= or keyby= but not both")
Expand Down
70 changes: 70 additions & 0 deletions R/with.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# `i` argument could get with=FALSE #4485
# Normalise the 'with' argument into a two-element list(i=, j=).
#
# with: logical vector of length 1 or 2, optionally named with "i" and/or "j".
# Returns list(i=, j=) where each element is the flag supplied for that
# argument, or NA when no flag was supplied for it.
# Raises an error when 'with' is not logical, has a length other than 1 or 2,
# has duplicated names, or has names other than "i"/"j".
wither = function(with) {
  if (!is.logical(with)) stop("'with' must be logical")
  n = length(with)
  if (n!=1L && n!=2L) stop("'with' must be length 1 or 2")
  nms = names(with)
  if (is.null(nms)) {
    # unnamed length 2 is positional: first element is for 'i', second for 'j'
    if (n==2L) return(list(i=with[1L], j=with[2L]))
    # unnamed scalar: deliberately left as NA/NA for backward compatibility, the new
    # 'with' optimization kicks in only for length 2 'with' or named 'with'
    return(list(i=NA, j=NA))
  }
  if (length(unique(nms))!=n) stop("'with' names must be unique")
  if (!all(nms %chin% c("i","j"))) stop("'with' names must be 'i' and/or 'j'")
  # subsetting by a name that is absent yields NA, i.e. "not supplied"
  list(i=with["i"], j=with["j"])
}
# Does expression 'x' use a negation operator at its top level?
# TRUE when 'x' is a call to `!`, or a call to `-` of length 2
# (operator plus a single operand, i.e. unary minus).
hasNot = function(x) {
  if (x %iscall% "!") TRUE
  else x %iscall% "-" && length(x)==2L
}
# Validate that a negated 'j' expression pairs the right operator with the right type:
# `!` with a logical operand, any other operator (callers pass unary `-`) with a numeric operand.
#
# subj:    the unevaluated 'j' expression; its second element is the negated operand
# env:     environment in which the operand is evaluated
# verbose: when TRUE, a note is printed for a valid use
# Returns TRUE for a valid combination, otherwise raises an error explaining the fix.
with_j_valid = function(subj, env, verbose) {
  uses_bang = subj %iscall% "!"
  operand = eval(subj[[2L]], env)
  valid = if (uses_bang) is.logical(operand) else is.numeric(operand)
  if (valid) {
    if (verbose) {
      cat("with[j]=FALSE used together with negation operator but the use case is valid: unary minus for numeric or bang for logical; not need to emit message\n")
    }
    return(valid)
  }
  # invalid: pick the explanation by the operand's type
  reason = if (is.logical(operand)) {
    "logical but has been used with unary '-', please change it to use '!' instead."
  } else if (is.numeric(operand)) {
    "numeric but has been used with '!', please change it to use unary '-' instead."
  } else if (is.character(operand)) {
    "character, please change it to use '!names(x) %in% j' instead."
  } else {
    "neither logical, numeric or character, please change it to be one of those."
  }
  stop(paste("Your 'j' argument is", reason, "Or simply not use new 'with' optimization by providing scalar unnamed logical. If you believe your use case is valid you can test it with 'options(datatable.with=2)', which might be default in future."))
}
# Convert an 'i' supplied under with=FALSE into integer row positions.
#
# i:       logical vector of exactly 'len' elements, or a numeric vector
# len:     number of rows of the table being subset
# verbose: when TRUE, a note is printed once the conversion is done
# Logical 'i' is turned into positions with which(); numeric 'i' is coerced to
# integer and handed to the C routine CconvertNegAndZeroIdx.
# Raises an error for any other type, or for logical 'i' of the wrong length.
with_i = function(i, len, verbose) {
  if (is.logical(i)) {
    # no recycling here because scalar TRUE has special meaning
    if (length(i)!=len) stop("'i' logical must be length of rows for with=FALSE, it is not recycled")
    rows = which(i)
  } else if (is.numeric(i)) {
    rows = .Call(CconvertNegAndZeroIdx, if (is.integer(i)) i else as.integer(i), len, TRUE)
  } else {
    stop("'i' must be numeric or logical when with=FALSE")
  }
  if (verbose) cat("with[i]=FALSE optimization\n")
  rows
}
# Convert a 'j' supplied under with=FALSE into integer column positions.
#
# j:       logical vector of exactly 'len' elements, character vector of column
#          names, or numeric vector; NA is not allowed
# len:     number of columns of 'x'
# x:       the table being subset; only its names are used, to resolve character 'j'
# verbose: when TRUE, a note is printed once the conversion is done
# Logical 'j' goes through which(), character 'j' is matched against names(x),
# numeric 'j' is coerced to integer and handed to the C routine CconvertNegAndZeroIdx.
# Raises an error for unsupported types, NA, wrong logical length or unknown names.
with_j = function(j, len, x, verbose) {
  supported = is.numeric(j) || is.logical(j) || is.character(j)
  if (!supported) stop("'j' must be numeric, logical or character when with=FALSE")
  if (anyNA(j)) stop("'j' must be non-NA")
  if (is.character(j)) {
    cols = chmatch(j, names(x))
    # a name with no match comes back as NA
    if (anyNA(cols)) stop("'j' must be existing columns")
  } else if (is.logical(j)) {
    if (length(j)!=len) stop("'j' logical must be length of columns for with=FALSE, it is not recycled")
    cols = which(j)
  } else {
    cols = .Call(CconvertNegAndZeroIdx, if (is.integer(j)) j else as.integer(j), len, FALSE)
  }
  if (verbose) cat("with[j]=FALSE optimization\n")
  cols
}
24 changes: 24 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -16853,3 +16853,27 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN))
test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
A = data.table(A=as.complex(rep(NA, 5)))
test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))

# `i` argument could get with=FALSE #4485
d = data.table(x=1:2, y=2:3, z=3:4)
# verbose output is what the tests below match on; keep the previous setting so it can be restored at the end
oldverbose = options(datatable.verbose=TRUE)
# 2139.01-2139.06: which verbose messages appear for scalar, named and length-2 'with'
test(2139.01, d[1L, 1L, with=F], data.table(x=1L), notOutput="with.*FALSE") ## not optimized for backward compatibility
test(2139.02, d[1L, 1L, with=c(j=F)], data.table(x=1L), output="with[j]=FALSE", notOutput="with[i]=FALSE")
test(2139.03, d[1L, 1L, with=c(T,F)], data.table(x=1L), output="with[j]=FALSE", notOutput="with[i]=FALSE")
test(2139.04, d[1L, 1L, with=c(F,T)], data.table(x=1L), output="with[i]=FALSE", notOutput="with[j]=FALSE")
test(2139.05, d[1L, 1L, with=c(T,T)], data.table(x=1L), notOutput="with.*FALSE")
test(2139.06, d[1L, 1L, with=c(F,F)], data.table(x=1L), output="with\\[i\\]=FALSE.*with\\[j\\]=FALSE.*with=FALSE short-circuit return")
# 2139.07-2139.08: valid negation in 'j' (unary minus on numeric, bang on logical)
test(2139.07, d[, -(2:3), with=c(j=F)], data.table(x=1:2), output="use case is valid")
test(2139.08, d[, !c(F,T,T), with=c(j=F)], data.table(x=1:2), output="use case is valid")
# 2139.81-2139.86: invalid negation in 'j' raises an error that explains the fix
test(2139.81, d[, !(2:3), with=c(j=F)], error="change it to use unary '-'")
test(2139.82, d[, -c(F,T,T), with=c(j=F)], error="change it to use '!'")
test(2139.83, d[, !c("y","z"), with=c(j=F)], error="change it to use '!names(x) %in% j'")
test(2139.84, d[, -c("y","z"), with=c(j=F)], error="change it to use '!names(x) %in% j'")
test(2139.85, d[, !list("y","z"), with=c(j=F)], error="change it to be one of those")
test(2139.86, d[, -list("y","z"), with=c(j=F)], error="change it to be one of those")
# 2139.91-2139.94: datatable.with=2 skips the negation validation
options(datatable.with=2) # all cases below 'unexpected' answer
test(2139.91, d[, !(2:3), with=c(j=F)], error="logical must be length of columns")
test(2139.92, d[, !c(2:3,0L), with=c(j=F)], data.table(z=3:4))
test(2139.93, d[, -c(F,T,T), with=c(j=F)], data.table(y=2:3, z=3:4), warning="Ignoring this dup and 0 other dups")
test(2139.94, d[, !list("y","z"), with=c(j=F)], error="invalid argument type")
options(datatable.with=NULL)
# restore verbosity, otherwise it would stay switched on for whatever runs after this group
options(oldverbose)