From a52a1205a9c5b255f473f28e540cf340ea8ba60f Mon Sep 17 00:00:00 2001 From: Sam Firke Date: Tue, 24 Jan 2023 17:17:17 -0500 Subject: [PATCH] implement formatting function for adorn_ns (#515) * implement format_func from #444 * specify namespace for dplyr::desc --- NEWS.md | 7 ++++++ R/adorn_ns.R | 29 ++++++++++++++++++----- R/get_dupes.R | 2 +- man/adorn_ns.Rd | 30 ++++++++++++++++++++---- tests/testthat/test-adorn-ns.R | 43 ++++++++++++++++++++++++++++++++-- 5 files changed, 98 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index 506589f3..7bbea432 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,11 @@ * `get_dupes()` results are now sorted first by descending order of `dupe_count`, then alphabetically by sorting variables. (#493) +* There are several minor breaking changes resulting from enhancements to `adorn_ns()`: + * The addition of the new argument `format_func` means that previous calls relying on `,,,` as shorthand to get to the `...` column selection argument may now require an extra comma + * `adorn_ns()` now defaults to displaying numbers of >3 digits with `big.mark = ","`, as part of the default value of the new `format_func` argument. E.g., `1234` is now `1,234`. + * `adorn_ns()` no longer prints leading whitespace when `position = "front"` - this is not a visible change in the printed result and it would be rare that this affects any code. + ## New features * `row_to_names()` now has a new helper function, `find_header()` to help find the row that contains the names. It can be used by passing `row_number="find_header"`, and see the documentation of `row_to_names()` and `find_header()` for more examples. (fix #429) @@ -24,6 +29,8 @@ * A new function `get_one_to_one()` has been added to find columns that map 1:1 to each other, even if the values within the columns differ (fix #291, **@billdenney**) +* `adorn_Ns()` contains a new `format_func` argument so that the user can format the Ns to their liking, e.g., changing the `big.mark` character. (#444) + * `clean_names()` can now be called on database connection in a dbplyr code pipeline (#467) ## Minor features diff --git a/R/adorn_ns.R b/R/adorn_ns.R index d202cbdb..4e62f7ca 100644 --- a/R/adorn_ns.R +++ b/R/adorn_ns.R @@ -5,7 +5,8 @@ #' #' @param dat a data.frame of class \code{tabyl} that has had \code{adorn_percentages} and/or \code{adorn_pct_formatting} called on it. If given a list of data.frames, this function will apply itself to each data.frame in the list (designed for 3-way \code{tabyl} lists). #' @param position should the N go in the front, or in the rear, of the percentage? -#' @param ns the Ns to append. The default is the "core" attribute of the input tabyl \code{dat}, where the original Ns of a two-way \code{tabyl} are stored. However, if you need to modify the numbers, e.g., to format \code{4000} as \code{4,000} or \code{4k}, you can do that separately and supply the formatted result here. +#' @param ns the Ns to append. The default is the "core" attribute of the input tabyl \code{dat}, where the original Ns of a two-way \code{tabyl} are stored. However, if your Ns are stored somewhere else, or you need to customize them beyond what can be done with `format_func`, you can supply them here. +#' @param format_func a formatting function to run on the Ns. Consider defining with \code{base::format()}. #' @param ... columns to adorn. This takes a tidyselect specification. By default, all columns are adorned except for the first column and columns not of class \code{numeric}, but this allows you to manually specify which columns should be adorned, for use on a data.frame that does not result from a call to \code{tabyl}. #' #' @return a data.frame with Ns appended @@ -18,6 +19,19 @@ #' adorn_pct_formatting() %>% #' adorn_ns(position = "front") #' +#' # Format the Ns with a custom format_func: +#' set.seed(1) +#' bigger_dat <- data.frame(sex = rep(c("m", "f"), 3000), +#' age = round(runif(3000, 1, 102), 0)) +#' bigger_dat$age_group = cut(bigger_dat$age, quantile(bigger_dat$age, c(0, 1/3, 2/3, 1))) +#' +#' bigger_dat %>% +#' tabyl(age_group, sex, show_missing_levels = FALSE) %>% +#' adorn_totals(c("row", "col")) %>% +#' adorn_percentages("col") %>% +#' adorn_pct_formatting(digits = 1) %>% +#' adorn_ns(format_func = function(x) format(x, big.mark = ".", decimal.mark = ",")) + #' # Control the columns to be adorned with the ... variable selection argument #' # If using only the ... argument, you can use empty commas as shorthand #' # to supply the default values to the preceding arguments: @@ -31,10 +45,10 @@ #' #'cases %>% #' adorn_percentages("col",,recovered:died) %>% -#' adorn_pct_formatting(,,,recovered:died) %>% -#' adorn_ns(,,recovered:died) +#' adorn_pct_formatting(,,,,,recovered:died) %>% +#' adorn_ns(,,,recovered:died) #' -adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), ...) { +adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), format_func = function(x) { format(x, big.mark = ",") }, ...) { # if input is a list, call purrr::map to recursively apply this function to each data.frame if (is.list(dat) && !is.data.frame(dat)) { purrr::map(dat, adorn_ns, position) # okay not to pass ns and allow for static Ns, b/c one size fits all for each list entry doesn't make sense for Ns. @@ -66,7 +80,7 @@ adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), ...) { if (custom_ns_supplied & !identical(dim(ns), dim(dat))) { # user-supplied Ns must include values for totals row/col if present stop("if supplying your own data.frame of Ns to append, its dimensions must match those of the data.frame in the \"dat\" argument") } - + # If appending the default Ns from the core, and there are totals rows/cols, append those values to the Ns table # Custom inputs to ns argument will need to calculate & format their own totals row/cols if (!custom_ns_supplied) { @@ -74,6 +88,9 @@ adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), ...) { ns <- adorn_totals(ns, attr(dat, "totals")) ns <- ns[order(match(ns[, 1], dat[, 1])), ] # from #407 - in rare event Totals row has been sorted off the bottom, sort to match } + numeric_cols <- which(vapply(ns, is.numeric, logical(1))) + ns[] <- lapply(ns, format_func) + ns[] <- lapply(ns, stringr::str_trim) } if (position == "rear") { @@ -92,7 +109,7 @@ adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), ...) { if(custom_ns_supplied & rlang::dots_n(...) == 0){ dont_adorn <- 1L } else if(rlang::dots_n(...) == 0){ - cols_to_adorn <- which(vapply(ns, is.numeric, logical(1))) # numeric cols + cols_to_adorn <- numeric_cols dont_adorn <- setdiff(1:ncol(dat), cols_to_adorn) dont_adorn <- unique(c(1, dont_adorn)) # always don't-append first column } else { diff --git a/R/get_dupes.R b/R/get_dupes.R index 98f355fb..56021a31 100644 --- a/R/get_dupes.R +++ b/R/get_dupes.R @@ -50,7 +50,7 @@ get_dupes <- function(dat, ...) { dplyr::add_count(!!! nms, name = "dupe_count") %>% dplyr::filter(dupe_count > 1) %>% dplyr::select(!!! nms, dupe_count, dplyr::everything()) %>% - dplyr::arrange(desc(dupe_count), !!! nms) + dplyr::arrange(dplyr::desc(dupe_count), !!! nms) # shorten error message for large data.frames if (length(var_names) > 10) { diff --git a/man/adorn_ns.Rd b/man/adorn_ns.Rd index 3c6c672e..e29e6d48 100644 --- a/man/adorn_ns.Rd +++ b/man/adorn_ns.Rd @@ -4,14 +4,24 @@ \alias{adorn_ns} \title{Add underlying Ns to a tabyl displaying percentages.} \usage{ -adorn_ns(dat, position = "rear", ns = attr(dat, "core"), ...) +adorn_ns( + dat, + position = "rear", + ns = attr(dat, "core"), + format_func = function(x) { + format(x, big.mark = ",") + }, + ... +) } \arguments{ \item{dat}{a data.frame of class \code{tabyl} that has had \code{adorn_percentages} and/or \code{adorn_pct_formatting} called on it. If given a list of data.frames, this function will apply itself to each data.frame in the list (designed for 3-way \code{tabyl} lists).} \item{position}{should the N go in the front, or in the rear, of the percentage?} -\item{ns}{the Ns to append. The default is the "core" attribute of the input tabyl \code{dat}, where the original Ns of a two-way \code{tabyl} are stored. However, if you need to modify the numbers, e.g., to format \code{4000} as \code{4,000} or \code{4k}, you can do that separately and supply the formatted result here.} +\item{ns}{the Ns to append. The default is the "core" attribute of the input tabyl \code{dat}, where the original Ns of a two-way \code{tabyl} are stored. However, if your Ns are stored somewhere else, or you need to customize them beyond what can be done with `format_func`, you can supply them here.} + +\item{format_func}{a formatting function to run on the Ns. Consider defining with \code{base::format()}.} \item{...}{columns to adorn. This takes a tidyselect specification. By default, all columns are adorned except for the first column and columns not of class \code{numeric}, but this allows you to manually specify which columns should be adorned, for use on a data.frame that does not result from a call to \code{tabyl}.} } @@ -29,6 +39,18 @@ mtcars \%>\% adorn_pct_formatting() \%>\% adorn_ns(position = "front") +# Format the Ns with a custom format_func: +set.seed(1) +bigger_dat <- data.frame(sex = rep(c("m", "f"), 3000), + age = round(runif(3000, 1, 102), 0)) +bigger_dat$age_group = cut(bigger_dat$age, quantile(bigger_dat$age, c(0, 1/3, 2/3, 1))) + +bigger_dat \%>\% + tabyl(age_group, sex, show_missing_levels = FALSE) \%>\% + adorn_totals(c("row", "col")) \%>\% + adorn_percentages("col") \%>\% + adorn_pct_formatting(digits = 1) \%>\% + adorn_ns(format_func = function(x) format(x, big.mark = ".", decimal.mark = ",")) # Control the columns to be adorned with the ... variable selection argument # If using only the ... argument, you can use empty commas as shorthand # to supply the default values to the preceding arguments: @@ -42,7 +64,7 @@ cases <- data.frame( cases \%>\% adorn_percentages("col",,recovered:died) \%>\% - adorn_pct_formatting(,,,recovered:died) \%>\% - adorn_ns(,,recovered:died) + adorn_pct_formatting(,,,,,recovered:died) \%>\% + adorn_ns(,,,recovered:died) } diff --git a/tests/testthat/test-adorn-ns.R b/tests/testthat/test-adorn-ns.R index e94a0b85..e6126a6e 100644 --- a/tests/testthat/test-adorn-ns.R +++ b/tests/testthat/test-adorn-ns.R @@ -33,8 +33,8 @@ test_that("front parameter works", { untabyl(), data.frame( x = c(letters[1:4], "Total"), - `0` = c("417 (82.7%)", " 2 (0.4%)", " 1 (0.2%)", " 0 (0.0%)", "420 (83.3%)"), - `1` = c("83 (16.5%)", " 0 (0.0%)", " 0 (0.0%)", " 1 (0.2%)", "84 (16.7%)"), + `0` = c("417 (82.7%)", "2 (0.4%)", "1 (0.2%)", "0 (0.0%)", "420 (83.3%)"), + `1` = c("83 (16.5%)", "0 (0.0%)", "0 (0.0%)", "1 (0.2%)", "84 (16.7%)"), check.names = FALSE, stringsAsFactors = FALSE ) @@ -198,3 +198,42 @@ test_that("adorn_ns works on single column data.frame with custom Ns if tidysele adorn_ns(ns = dplyr::select(attr(adorned_single, "core"), a = `4`),,,, a) expect_equal(stringr::str_sub(adorned_single$a, -4, -1), c(" (3)", " (8)")) }) + +# This tests the display of the decimal.mark by forcing a decimal into a tabyl +# Can't happen with a natural table, but maybe someone will use adorn_ns on a homespun data.frame +test_that("formatting function works, #444", { + set.seed(1) + bigger_dat <- data.frame(sex = rep(c("m", "f"), 3000), + age = round(runif(3000, 1, 102), 0)) + bigger_dat$age_group = cut(bigger_dat$age, quantile(bigger_dat$age, c(0, 1/3, 2/3, 1))) + + bigger_tab <- bigger_dat %>% + tabyl(age_group, sex, show_missing_levels = F) + + standard_output <- bigger_tab %>% + adorn_percentages("col") %>% + adorn_pct_formatting(digits = 1) %>% + adorn_ns(position = "front") + + # test commas in thousands place by default + expect_equal(standard_output$f, + c("1,018 (33.9%)", "990 (33.0%)", "980 (32.7%)", "12 (0.4%)") + ) + + # Test decimal mark + bigger_tab$f[1] <- 1018.5 # makes no sense in a tabyl but need for testing decimal mark display + + bigger_result <- bigger_tab %>% + untabyl() %>% # to get the decimal into the core + as_tabyl() %>% + adorn_totals(c("row", "col")) %>% + adorn_percentages("col") %>% + adorn_pct_formatting(digits = 1) %>% + adorn_ns(position = "rear", format_func = function(x) format(x, big.mark = ".", decimal.mark = ",")) + + expect_equal( + bigger_result$f, + c("33.9% (1.018,5)", "33.0% (990,0)", "32.7% (980,0)", "0.4% (12,0)", + "100.0% (3.000,5)") + ) +}) \ No newline at end of file