Skip to content

Commit

Permalink
on-disk cases denoted on benchplot, closes #126
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki committed Dec 24, 2019
1 parent c750dfa commit c1b8a59
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 8 deletions.
9 changes: 6 additions & 3 deletions _benchplot/benchplot.R
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ format_exception = function(ex, s, d, q, which=c("data","query"), short=TRUE) {
}
ans
}
# Append a "*" suffix to the long name of every solution that ran on-disk (#126).
#   x       - vector of solution long names.
#   on_disk - on-disk flags aligned with x; only elements that are exactly TRUE
#             get the star, so NA and FALSE leave the name unchanged.
# vapply() is used instead of sapply() because sapply() returns an empty list()
# for zero-length input, which is not a logical 'test' vector for fifelse().
format_name_long = function(x, on_disk) fifelse(vapply(on_disk, isTRUE, logical(1L)), paste0(x,"*"), x) # star suffix for on-disk cases #126
# Render solution versions as text for the legend; missing versions are shown
# as the literal string "NA" instead of a true NA.
format_version = function(x) {
  version_missing = is.na(x)
  version_text = as.character(x)
  fifelse(version_missing, "NA", version_text)
}
# Render batch identifiers as dates for the legend. Each value is converted with
# as.numeric() and read as seconds since 1970-01-01; missing entries are shown
# as the literal string "NA".
format_batch = function(x) {
  batch_time = as.POSIXct(as.numeric(x), origin="1970-01-01")
  batch_date = format(as.Date(batch_time))
  fifelse(is.na(x), "NA", batch_date)
}
format_s_total_real_time_sec = function(data, solution, s_questions, s_total_real_time_sec, exceptions) {
Expand Down Expand Up @@ -221,23 +222,25 @@ header_legend = function(x, exceptions=list(), title.txt.fun=default.title.txt.f
dt = x[, .(data=unique1(data), version=unique1(version), batch=unique1(batch),
s_total_real_time_sec=unique1(s_total_real_time_sec), col_strong=unique1(col_strong),
name_short=unique1(name_short), name_long=unique1(name_long),
on_disk=unique1(on_disk), #126
s_questions=list(question)), ## retain all questions so can lookup for exceptions later on
keyby="solution"]
setorderv(dt, "s_total_real_time_sec", na.last=TRUE)
if (length(pending)) dt = rbindlist(list(
dt,
data.table(solution=NA_integer_, data=NA_integer_, version=NA_integer_, batch=NA_integer_, s_total_real_time_sec=NA_real_, col_strong="black", name_short=NA_character_, name_long=paste(pending, collapse=", "), s_questions=list())
data.table(solution=NA_integer_, data=NA_integer_, version=NA_integer_, batch=NA_integer_, s_total_real_time_sec=NA_real_, col_strong="black", name_short=NA_character_, name_long=paste(pending, collapse=", "), on_disk=FALSE, s_questions=list())
))
dt[!is.na(solution), `:=`(
format_name_long=format_name_long(name_long, on_disk),
format_version=format_version(version), format_batch=format_batch(batch),
format_s_total_real_time_sec = format_s_total_real_time_sec(data, solution, s_questions, s_total_real_time_sec, exceptions)
)]
dt[is.na(solution), `:=`(format_version="", format_batch="see README", format_s_total_real_time_sec="pending")]
dt[is.na(solution), `:=`(format_name_long=name_long, format_version="", format_batch="see README", format_s_total_real_time_sec="pending")]
dt[, "s_questions" := NULL]
dt[, legend(x_off[2L], xy[["y2"]], bty="n", cex=1.5,
pch=22, pt.bg=col_strong, pt.cex=3.5, ## color square
text.font=1, xpd=NA,
legend=name_long)] -> nul ## solution long name
legend=format_name_long)] -> nul ## solution long name
dt[, legend(x_off[20L], xy[["y2"]], bty="n", cex=1.5, text.font=1, xpd=NA,
legend=format_version)] -> nul ## version
dt[, legend(x_off[35L], xy[["y2"]], bty="n", cex=1.5, text.font=1, xpd=NA,
Expand Down
2 changes: 1 addition & 1 deletion _report/index.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ rpivotTable::rpivotTable(
- We ensure that calculations are not deferred by solution.
- We also tested that answers produced by different solutions match each other; for details see [_utils/answers-validation.R](https://github.com/h2oai/db-benchmark/blob/master/_utils/answers-validation.R).
- ClickHouse queries were made against `mergetree` table engine, see [#91](https://github.com/h2oai/db-benchmark/issues/91) for details.
- Other solutions are using in-memory data storage to achieve best timing. In case a solution runs out of memory (we use 125 GB machine), it will use on-disk data storage if possible (and if implemented, see [#126](https://github.com/h2oai/db-benchmark/issues/126) for status).
- Other solutions use in-memory data storage to achieve the best timing. If a solution runs out of memory (we use a 125 GB machine), it will use on-disk data storage if possible (and if implemented, see [#126](https://github.com/h2oai/db-benchmark/issues/126) for status). In that case the solution name is marked with a `*` suffix.

## Environment configuration

Expand Down
15 changes: 11 additions & 4 deletions _report/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ clean_time = function(d) {
stop("timings data contains NA or '' as version field, that should not happen")
old_advanced_groupby_questions = c("median v3 sd v3 by id2 id4","max v1 - min v2 by id2 id4","largest two v3 by id2 id4","regression v1 v2 by id2 id4","sum v3 count by id1:id6")
d[!nzchar(git), git := NA_character_
][,"on_disk" := as.logical(on_disk)
][task=="groupby" & solution%in%c("pandas","dask","spark") & batch<1558106628, "out_cols" := NA_integer_
][task=="groupby" & solution=="dask" & batch<1558106628 & question%in%c("max v1 - min v2 by id2 id4","regression v1 v2 by id2 id4"), c("out_rows","out_cols","chk") := .(NA_integer_, NA_integer_, NA_character_)
][task=="groupby" & solution=="pandas" & batch<=1558106628 & question=="largest two v3 by id2 id4", "out_cols" := NA_integer_
Expand Down Expand Up @@ -83,7 +84,9 @@ model_time = function(d) {
#d[,.SD][!is.na(out_rows), `:=`(unq_out_rows=uniqueN(out_rows), paste_unq_out_rows=paste(unique(out_rows), collapse=",")), .(task, data, question)][unq_out_rows>1, .(paste_unq_out_rows), .(task, solution, data, question)]
if (nrow(d[!is.na(out_cols), .(unqn_out_cols=uniqueN(out_cols)), .(task, data, question)][unqn_out_cols>1L]))
stop("Value of 'out_cols' varies for different runs for single question")
d = dcast(d, nodename+batch+in_rows+question+solution+fun+cache+version+git+task+data ~ run, value.var=c("timestamp","time_sec","mem_gb","chk_time_sec","chk","out_rows","out_cols"))
if (nrow(d[, .(unqn_on_disk=uniqueN(on_disk)), .(task, solution, data, batch)][unqn_on_disk>1L]))
stop("Value of 'on_disk' varies for different questions/runs for single solution+data+batch") # on_disk should be const in script
d = dcast(d, nodename+batch+in_rows+question+solution+fun+on_disk+cache+version+git+task+data ~ run, value.var=c("timestamp","time_sec","mem_gb","chk_time_sec","chk","out_rows","out_cols"))
d[, c("chk_2","out_rows_2","out_cols_2") := NULL]
setnames(d, c("chk_1","out_rows_1","out_cols_1"), c("chk","out_rows","out_cols"))
d
Expand Down Expand Up @@ -153,14 +156,18 @@ transform = function(ld) {
ld[, max_batch:=max(batch), c("solution","task","data")]
ld[, script_recent:=FALSE][batch==max_batch, script_recent:=TRUE][, max_batch:=NULL]
ld[, "na_time_sec":=FALSE][is.na(time_sec_1) | is.na(time_sec_2), "na_time_sec":=TRUE]

ld[, "on_disk" := on_disk[1L], by=c("batch","solution","task","data")] # on_disk is a constant across whole script, fill trailing NA so advanced question group will not stay NA if basic had that info #126

{ # clickhouse memory/mergetree table engine handling
ld[, "engine":=NA_character_]
ld[task=="groupby" & solution=="clickhouse" & substr(data, 1L, 2L)=="G1", engine:="memory"]
ld[task=="groupby" & solution=="clickhouse" & substr(data, 1L, 2L)=="G2", engine:="mergetree"]
## according to #91 we now will present mergetree only
ld = ld[!(task=="groupby" & solution=="clickhouse" & engine=="memory")]
ld[task=="groupby" & solution=="clickhouse" & engine=="mergetree", data:=gsub("G2", "G1", data, fixed=TRUE)]
ld[task=="groupby" & solution=="clickhouse" & engine=="mergetree", `:=`(
data = gsub("G2", "G1", data, fixed=TRUE),
on_disk = !on_disk ## swap to denote slower method with star suffix, so for clickhouse it is (currently unused) memory table engine, otherwise clickhouse would always be marked by star #126
)]
#if (nrow(ld[task=="groupby" & solution=="clickhouse" & engine=="memory" & na_time_sec==TRUE])) {
# ld[task=="groupby" & solution=="clickhouse" & engine=="mergetree"
# ][, `:=`(
Expand All @@ -181,7 +188,7 @@ transform = function(ld) {
}

ld[, c(list(nodename=nodename, batch=batch, ibatch=as.integer(ft(as.character(batch))), solution=solution,
question=question, question_group=question_group, fun=fun, version=version, git=git, task=task, data=data, engine=engine),
question=question, question_group=question_group, fun=fun, on_disk=on_disk, cache=cache, version=version, git=git, task=task, data=data, engine=engine),
ftdata(data, task=as.character(task)), .SD),
.SDcols=c(paste(rep(c("timestamp","time_sec","mem_gb","chk_time_sec"), each=2), 1:2, sep="_"),
paste("script", c("finish","start","stderr","recent"), sep="_"),
Expand Down

0 comments on commit c1b8a59

Please sign in to comment.