man/tbl_custom_summary.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tbl_custom_summary.R
\name{tbl_custom_summary}
\alias{tbl_custom_summary}
\title{Create a table of summary statistics using a custom summary function}
\usage{
tbl_custom_summary(
  data,
  by = NULL,
  label = NULL,
  stat_fns,
  statistic,
  digits = NULL,
  type = NULL,
  value = NULL,
  missing = c("ifany", "no", "always"),
  missing_text = "Unknown",
  missing_stat = "{N_miss}",
  include = everything(),
  overall_row = FALSE,
  overall_row_last = FALSE,
  overall_row_label = "Overall"
)
}
\arguments{
\item{data}{(\code{data.frame})\cr A data frame.}

\item{by}{(\code{\link[dplyr:dplyr_tidy_select]{tidy-select}})\cr
A single column from \code{data}. Summary statistics will be stratified by this variable.
Default is \code{NULL}.}

\item{label}{(\code{\link[=syntax]{formula-list-selector}})\cr
Used to override default labels in summary table, e.g. \code{list(age = "Age, years")}.
The default for each variable is the column label attribute, \code{attr(., 'label')}.
If no label has been set, the column name is used.}

\item{stat_fns}{(\code{\link[=syntax]{formula-list-selector}})\cr
Specifies the function to be used to compute the statistics
(see below for details and examples).
You can also use dedicated helpers such as \code{\link[=ratio_summary]{ratio_summary()}}
or \code{\link[=proportion_summary]{proportion_summary()}}.}

\item{statistic}{(\code{\link[=syntax]{formula-list-selector}})\cr
Specifies summary statistics to display for each variable.  The default is
\code{list(all_continuous() ~ "{median} ({p25}, {p75})", all_categorical() ~ "{n} ({p}\%)")}.
See below for details.}

\item{digits}{(\code{\link[=syntax]{formula-list-selector}})\cr
Specifies how summary statistics are rounded. Values may be either integer(s)
or function(s). If not specified, default formatting is assigned
via \code{assign_summary_digits()}. See below for details.}

\item{type}{(\code{\link[=syntax]{formula-list-selector}})\cr
Specifies the summary type. Accepted values are
\code{c("continuous", "continuous2", "categorical", "dichotomous")}.
If not specified, default type is assigned via
\code{assign_summary_type()}. See below for details.}

\item{value}{(\code{\link[=syntax]{formula-list-selector}})\cr
Specifies the level of a variable to display on a single row.
The gtsummary type selectors, e.g. \code{all_dichotomous()}, cannot be used
with this argument. Default is \code{NULL}. See below for details.}

\item{missing, missing_text, missing_stat}{Arguments dictating how and if missing values are presented:
\itemize{
\item \code{missing}: must be one of \code{c("ifany", "no", "always")}
\item \code{missing_text}: string indicating text shown on missing row. Default is \code{"Unknown"}
\item \code{missing_stat}: statistic to show on missing row. Default is \code{"{N_miss}"}.
Possible values are \code{N_miss}, \code{N_obs}, \code{N_nonmiss}, \code{p_miss}, \code{p_nonmiss}.
}}

\item{include}{(\code{\link[dplyr:dplyr_tidy_select]{tidy-select}})\cr
Variables to include in the summary table. Default is \code{everything()}.}

\item{overall_row}{(scalar \code{logical})\cr
Logical indicator to display an overall row. Default is
\code{FALSE}. Use \code{\link[=add_overall]{add_overall()}} to add an overall column.}

\item{overall_row_last}{(scalar \code{logical})\cr
Logical indicator to display overall row last in
table. Default is \code{FALSE}, which will display overall row first.}

\item{overall_row_label}{(\code{string})\cr
String indicating the overall row label. Default is \code{"Overall"}.}
}
\value{
A \code{tbl_custom_summary} object
}
\description{
\lifecycle{experimental}\cr
The \code{tbl_custom_summary()} function calculates descriptive statistics for
continuous, categorical, and dichotomous variables.
This function is similar to \code{\link[=tbl_summary]{tbl_summary()}} but allows you to provide
a custom function in charge of computing the statistics (see the
\code{stat_fns} argument section below).
}
\section{Similarities with \code{tbl_summary()}}{

Please refer to the help file of \code{\link[=tbl_summary]{tbl_summary()}} regarding the use of select
helpers, and arguments \code{include}, \code{by}, \code{type}, \code{value}, \code{digits}, \code{missing} and
\code{missing_text}.
}

\section{\code{stat_fns} argument}{

The \code{stat_fns} argument specifies the custom function(s) to be used for computing
the summary statistics. For example, \code{stat_fns = everything() ~ foo}.

Each function may take the following arguments:
\code{foo(data, full_data, variable, by, type, ...)}
\itemize{
\item \verb{data=} is the input data frame passed to \code{tbl_custom_summary()}, subset
according to the level of \code{by} or \code{variable} if any, excluding \code{NA}
values of the current \code{variable}
\item \verb{full_data=} is the full input data frame passed to \code{tbl_custom_summary()}
\item \verb{variable=} is a string indicating the variable to perform the
calculation on
\item \verb{by=} is a string indicating the by variable from \code{tbl_custom_summary()},
if present
\item \verb{type=} is a string indicating the type of variable
(continuous, categorical, ...)
\item \verb{stat_display=} a string indicating the statistic to display (for the
\code{statistic} argument, for that variable)
}

The user-defined function does not need to utilize each of these inputs. It's
encouraged that the user-defined function accept \code{...} as each of the arguments
\emph{will} be passed to the function, even if not all inputs are utilized by
the user's function, e.g. \code{foo(data, ...)} (see examples).

The user-defined function should return a one-row \code{\link[dplyr:reexports]{dplyr::tibble()}} with
one column per summary statistic (see examples).
}

\section{statistic argument}{

The statistic argument specifies the statistics presented in the table. The
input is a list of formulas that specify the statistics to report. For example,
\code{statistic = list(age ~ "{mean} ({sd})")}.
A statistic name that appears between curly brackets
will be replaced with the numeric statistic (see \code{\link[glue:glue]{glue::glue()}}).
All the statistics indicated in the statistic argument should be returned
by the functions defined in the \code{stat_fns} argument.

When the summary type is \code{"continuous2"}, pass a vector of statistics. Each element
of the vector will result in a separate row in the summary table.

For both categorical and continuous variables, statistics on the number of
missing and non-missing observations and their proportions are also available
to display.
\itemize{
\item \code{{N_obs}} total number of observations
\item \code{{N_miss}} number of missing observations
\item \code{{N_nonmiss}} number of non-missing observations
\item \code{{p_miss}} percentage of observations missing
\item \code{{p_nonmiss}} percentage of observations not missing
}

Note that for categorical variables, \code{{N_obs}}, \code{{N_miss}} and \code{{N_nonmiss}} refer
to the total number, number of missing and number of non-missing observations
in the denominator, not at each level of the categorical variable.

It is recommended to use \code{\link[=modify_footnote]{modify_footnote()}} to properly describe the
displayed statistics (see examples).
}

\section{Caution}{


The returned table is compatible with all \code{gtsummary} features applicable
to a \code{tbl_summary} object, like \code{\link[=add_overall]{add_overall()}}, \code{\link[=modify_footnote]{modify_footnote()}} or
\code{\link[=bold_labels]{bold_labels()}}.

However, some of them could be inappropriate in such a case. In particular,
\code{\link[=add_p]{add_p()}} does not take into account the type of displayed statistics and
always returns the p-value of a comparison test of the current variable
according to the \code{by} groups, which may be incorrect if the displayed
statistics refer to a third variable.
}

\examples{
# Example 1 ----------------------------------
# Custom summary function: for the subset of rows it receives, return the
# total of `marker` and the average of `age`, both ignoring missing values.
# Any other arguments passed by tbl_custom_summary() are absorbed by `...`.
my_stats <- function(data, ...) {
  dplyr::tibble(
    marker_sum = sum(data$marker, na.rm = TRUE),
    mean_age = mean(data$age, na.rm = TRUE)
  )
}

# Call the custom function directly on `trial` to inspect the one-row
# tibble it returns
my_stats(trial)

# Apply my_stats() to every included variable; the placeholders in the
# `statistic` string are the column names returned by my_stats().
# The footnote spells out the "A" and "S" abbreviations used in the cells.
trial |>
  tbl_custom_summary(
    include = c("stage", "grade"),
    by = "trt",
    stat_fns = everything() ~ my_stats,
    statistic = everything() ~ "A: {mean_age} - S: {marker_sum}",
    digits = everything() ~ c(1, 0),
    overall_row = TRUE,
    overall_row_label = "All stages & grades"
  ) |>
  add_overall(last = TRUE) |>
  modify_footnote(
    all_stat_cols() ~ "A: mean age - S: sum of marker"
  ) |>
  bold_labels()

# Example 2 ----------------------------------
# Use `data[[variable]]` to access the current variable
# Custom summary function: run t.test() on the column named by `variable`
# and return its estimate together with the lower and upper bounds of the
# confidence interval. Unused arguments are absorbed by `...`.
mean_ci <- function(data, variable, ...) {
  tt <- t.test(data[[variable]])
  ci <- tt$conf.int
  dplyr::tibble(
    mean = tt$estimate,
    conf.low = ci[1],
    conf.high = ci[2]
  )
}

# Summarise `marker` and `ttdeath` by `trt` with mean_ci(); the placeholders
# in the `statistic` string are the column names returned by mean_ci().
trial |>
  tbl_custom_summary(
    include = c("marker", "ttdeath"),
    by = "trt",
    stat_fns = ~ mean_ci,
    statistic = ~ "{mean} [{conf.low}; {conf.high}]"
  ) |>
  add_overall(last = TRUE) |>
  modify_footnote(
    all_stat_cols() ~ "mean [95\% CI]"
  )

# Example 3 ----------------------------------
# Use `full_data` to access the full dataset
# Returned statistic can also be a character
# Custom summary function: compare the mean of `marker` in the current
# subset (`data`) with its mean in the complete data (`full_data`), both
# ignoring missing values. Returns the two means, their difference, and a
# character flag that is "high" when the difference is positive and "low"
# otherwise.
diff_to_great_mean <- function(data, full_data, ...) {
  subset_mean <- mean(data$marker, na.rm = TRUE)
  overall_mean <- mean(full_data$marker, na.rm = TRUE)
  gap <- subset_mean - overall_mean
  dplyr::tibble(
    mean = subset_mean,
    great_mean = overall_mean,
    diff = gap,
    level = ifelse(gap > 0, "high", "low")
  )
}

# Summarise `grade` and `stage` by `trt` with diff_to_great_mean(); the
# `statistic` string mixes numeric columns (mean, diff) with the character
# column `level` returned by the function.
trial |>
  tbl_custom_summary(
    include = c("grade", "stage"),
    by = "trt",
    stat_fns = ~ diff_to_great_mean,
    statistic = ~ "{mean} ({level}, diff: {diff})",
    overall_row = TRUE
  ) |>
  bold_labels()
}
\author{
Joseph Larmarange
}