-
Notifications
You must be signed in to change notification settings - Fork 110
/
tbl_custom_summary.Rd
259 lines (224 loc) · 9.63 KB
/
tbl_custom_summary.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tbl_custom_summary.R
\name{tbl_custom_summary}
\alias{tbl_custom_summary}
\title{Create a table of summary statistics using a custom summary function}
\usage{
tbl_custom_summary(
data,
by = NULL,
label = NULL,
stat_fns,
statistic,
digits = NULL,
type = NULL,
value = NULL,
missing = c("ifany", "no", "always"),
missing_text = "Unknown",
missing_stat = "{N_miss}",
include = everything(),
overall_row = FALSE,
overall_row_last = FALSE,
overall_row_label = "Overall"
)
}
\arguments{
\item{data}{(\code{data.frame})\cr A data frame.}
\item{by}{(\code{\link[dplyr:dplyr_tidy_select]{tidy-select}})\cr
A single column from \code{data}. Summary statistics will be stratified by this variable.
Default is \code{NULL}.}
\item{label}{(\code{\link[=syntax]{formula-list-selector}})\cr
Used to override default labels in summary table, e.g. \code{list(age = "Age, years")}.
The default for each variable is the column label attribute, \code{attr(., 'label')}.
If no label has been set, the column name is used.}
\item{stat_fns}{(\code{\link[=syntax]{formula-list-selector}})\cr
Specifies the function to be used to compute the statistics
(see below for details and examples).
You can also use dedicated helpers such as \code{\link[=ratio_summary]{ratio_summary()}}
or \code{\link[=proportion_summary]{proportion_summary()}}.}
\item{statistic}{(\code{\link[=syntax]{formula-list-selector}})\cr
Specifies summary statistics to display for each variable. The default is
\code{list(all_continuous() ~ "{median} ({p25}, {p75})", all_categorical() ~ "{n} ({p}\%)")}.
See below for details.}
\item{digits}{(\code{\link[=syntax]{formula-list-selector}})\cr
Specifies how summary statistics are rounded. Values may be either integer(s)
or function(s). If not specified, default formatting is assigned
via \code{assign_summary_digits()}. See below for details.}
\item{type}{(\code{\link[=syntax]{formula-list-selector}})\cr
Specifies the summary type. Accepted values are
\code{c("continuous", "continuous2", "categorical", "dichotomous")}.
If not specified, default type is assigned via
\code{assign_summary_type()}. See below for details.}
\item{value}{(\code{\link[=syntax]{formula-list-selector}})\cr
Specifies the level of a variable to display on a single row.
The gtsummary type selectors, e.g. \code{all_dichotomous()}, cannot be used
with this argument. Default is \code{NULL}. See below for details.}
\item{missing, missing_text, missing_stat}{Arguments dictating how and if missing values are presented:
\itemize{
\item \code{missing}: must be one of \code{c("ifany", "no", "always")}
\item \code{missing_text}: string indicating text shown on missing row. Default is \code{"Unknown"}
\item \code{missing_stat}: statistic to show on missing row. Default is \code{"{N_miss}"}.
Possible values are \code{N_miss}, \code{N_obs}, \code{N_nonmiss}, \code{p_miss}, \code{p_nonmiss}.
}}
\item{include}{(\code{\link[dplyr:dplyr_tidy_select]{tidy-select}})\cr
Variables to include in the summary table. Default is \code{everything()}.}
\item{overall_row}{(scalar \code{logical})\cr
Logical indicator to display an overall row. Default is
\code{FALSE}. Use \code{\link[=add_overall]{add_overall()}} to add an overall column.}
\item{overall_row_last}{(scalar \code{logical})\cr
Logical indicator to display overall row last in
table. Default is \code{FALSE}, which will display overall row first.}
\item{overall_row_label}{(\code{string})\cr
String indicating the overall row label. Default is \code{"Overall"}.}
}
\value{
A \code{tbl_custom_summary} object
}
\description{
\lifecycle{experimental}\cr
The \code{tbl_custom_summary()} function calculates descriptive statistics for
continuous, categorical, and dichotomous variables.
This function is similar to \code{\link[=tbl_summary]{tbl_summary()}} but allows you to provide
a custom function in charge of computing the statistics (see Details).
}
\section{Similarities with \code{tbl_summary()}}{
Please refer to the help file of \code{\link[=tbl_summary]{tbl_summary()}} regarding the use of select
helpers, and arguments \code{include}, \code{by}, \code{type}, \code{value}, \code{digits}, \code{missing} and
\code{missing_text}.
}
\section{\code{stat_fns} argument}{
The \code{stat_fns} argument specifies the custom function(s) to be used for computing
the summary statistics. For example, \code{stat_fns = everything() ~ foo}.
Each function may take the following arguments:
\code{foo(data, full_data, variable, by, type, ...)}
\itemize{
\item \verb{data=} is the input data frame passed to \code{tbl_custom_summary()}, subset
according to the level of \code{by} or \code{variable} if any, excluding \code{NA}
values of the current \code{variable}
\item \verb{full_data=} is the full input data frame passed to \code{tbl_custom_summary()}
\item \verb{variable=} is a string indicating the variable to perform the
calculation on
\item \verb{by=} is a string indicating the by variable from \code{tbl_custom_summary()},
if present
\item \verb{type=} is a string indicating the type of variable
(continuous, categorical, ...)
\item \verb{stat_display=} is a string indicating the statistic to display (from the
\code{statistic} argument, for that variable)
}
The user-defined function does not need to utilize each of these inputs. It is
encouraged that the user-defined function accept \code{...}, as each of the arguments
\emph{will} be passed to the function, even if not all inputs are utilized by
the user's function, e.g. \code{foo(data, ...)} (see examples).
The user-defined function should return a one row \code{\link[dplyr:reexports]{dplyr::tibble()}} with
one column per summary statistic (see examples).
}
\section{statistic argument}{
The statistic argument specifies the statistics presented in the table. The
input is a list of formulas that specify the statistics to report. For example,
\code{statistic = list(age ~ "{mean} ({sd})")}.
A statistic name that appears between curly brackets
will be replaced with the numeric statistic (see \code{\link[glue:glue]{glue::glue()}}).
All the statistics indicated in the statistic argument should be returned
by the functions defined in the \code{stat_fns} argument.
When the summary type is \code{"continuous2"}, pass a vector of statistics. Each element
of the vector will result in a separate row in the summary table.
For both categorical and continuous variables, statistics on the number of
missing and non-missing observations and their proportions are also available
to display.
\itemize{
\item \code{{N_obs}} total number of observations
\item \code{{N_miss}} number of missing observations
\item \code{{N_nonmiss}} number of non-missing observations
\item \code{{p_miss}} percentage of observations missing
\item \code{{p_nonmiss}} percentage of observations not missing
}
Note that for categorical variables, \code{{N_obs}}, \code{{N_miss}} and \code{{N_nonmiss}} refer
to the total number, the number of missing and the number of non-missing observations
in the denominator, not at each level of the categorical variable.
It is recommended to use \code{\link[=modify_footnote]{modify_footnote()}} to properly describe the
displayed statistics (see examples).
}
\section{Caution}{
The returned table is compatible with all \code{gtsummary} features applicable
to a \code{tbl_summary} object, like \code{\link[=add_overall]{add_overall()}}, \code{\link[=modify_footnote]{modify_footnote()}} or
\code{\link[=bold_labels]{bold_labels()}}.
However, some of them could be inappropriate in such cases. In particular,
\code{\link[=add_p]{add_p()}} does not take into account the type of displayed statistics and
always returns the p-value of a comparison test of the current variable
according to the \code{by} groups, which may be incorrect if the displayed
statistics refer to a third variable.
}
\examples{
# Example 1 ----------------------------------
# Custom summary function: returns a one-row tibble holding the sum of
# `marker` and the mean of `age` for the data frame it receives, ignoring
# missing values. Any additional arguments are absorbed by `...`.
my_stats <- function(data, ...) {
  dplyr::tibble(
    marker_sum = sum(data$marker, na.rm = TRUE),
    mean_age = mean(data$age, na.rm = TRUE)
  )
}
# Call the function directly on the whole data set to preview the
# one-row tibble of statistics it returns
my_stats(trial)
# Build the table for `stage` and `grade`, stratified by `trt`, using
# `my_stats` for every included variable
trial |>
tbl_custom_summary(
include = c("stage", "grade"),
by = "trt",
stat_fns = everything() ~ my_stats,
statistic = everything() ~ "A: {mean_age} - S: {marker_sum}",
digits = everything() ~ c(1, 0),
overall_row = TRUE,
overall_row_label = "All stages & grades"
) |>
# add an overall column, placed last
add_overall(last = TRUE) |>
# footnote explaining the abbreviations used in the statistic template
modify_footnote(
all_stat_cols() ~ "A: mean age - S: sum of marker"
) |>
bold_labels()
# Example 2 ----------------------------------
# Use `data[[variable]]` to access the current variable
# Custom summary function: runs a one-sample t-test on the column named by
# `variable` and returns a one-row tibble with the estimated mean and the
# bounds of the confidence interval reported by the test.
mean_ci <- function(data, variable, ...) {
  tt <- t.test(data[[variable]])
  ci <- tt$conf.int
  dplyr::tibble(
    mean = tt$estimate,
    conf.low = ci[1],
    conf.high = ci[2]
  )
}
# Build the table for `marker` and `ttdeath`, stratified by `trt`; the
# one-sided formulas apply `mean_ci` and the statistic template to the
# included variables
trial |>
tbl_custom_summary(
include = c("marker", "ttdeath"),
by = "trt",
stat_fns = ~ mean_ci,
statistic = ~ "{mean} [{conf.low}; {conf.high}]"
) |>
# add an overall column, placed last
add_overall(last = TRUE) |>
# footnote describing the displayed statistic
modify_footnote(
all_stat_cols() ~ "mean [95\% CI]"
)
# Example 3 ----------------------------------
# Use `full_data` to access the full dataset
# Returned statistic can also be a character
# Custom summary function: compares the mean of `marker` in the current
# subset (`data`) with its mean in the complete data (`full_data`).
# Returns a one-row tibble with both means, their difference and a
# character flag that is "high" when the subset mean exceeds the overall
# mean and "low" otherwise.
diff_to_great_mean <- function(data, full_data, ...) {
  subset_mean <- mean(data$marker, na.rm = TRUE)
  overall_mean <- mean(full_data$marker, na.rm = TRUE)
  delta <- subset_mean - overall_mean
  dplyr::tibble(
    mean = subset_mean,
    great_mean = overall_mean,
    diff = delta,
    level = ifelse(delta > 0, "high", "low")
  )
}
# Build the table for `grade` and `stage`, stratified by `trt`, with an
# overall row; `level` is a character statistic returned by
# `diff_to_great_mean`
trial |>
tbl_custom_summary(
include = c("grade", "stage"),
by = "trt",
stat_fns = ~ diff_to_great_mean,
statistic = ~ "{mean} ({level}, diff: {diff})",
overall_row = TRUE
) |>
bold_labels()
}
\author{
Joseph Larmarange
}