From a52a1205a9c5b255f473f28e540cf340ea8ba60f Mon Sep 17 00:00:00 2001
From: Sam Firke <sfirke@users.noreply.github.com>
Date: Tue, 24 Jan 2023 17:17:17 -0500
Subject: [PATCH] implement formatting function for adorn_ns (#515)

* implement format_func from #444

* specify namespace for dplyr::desc
---
 NEWS.md                        |  7 ++++++
 R/adorn_ns.R                   | 29 ++++++++++++++++++-----
 R/get_dupes.R                  |  2 +-
 man/adorn_ns.Rd                | 30 ++++++++++++++++++++----
 tests/testthat/test-adorn-ns.R | 43 ++++++++++++++++++++++++++++++++--
 5 files changed, 98 insertions(+), 13 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 506589f3..7bbea432 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -12,6 +12,11 @@
 
 * `get_dupes()` results are now sorted first by descending order of `dupe_count`, then alphabetically by sorting variables. (#493)
 
+* There are several minor breaking changes resulting from enhancements to `adorn_ns()`:
+  * The addition of the new argument `format_func` means that previous calls relying on `,,,` as shorthand to get to the `...` column selection argument may now require an extra comma
+  * `adorn_ns()` now defaults to displaying numbers of >3 digits with `big.mark = ","`, as part of the default value of the new `format_func` argument.  E.g., `1234`  is now `1,234`.
+  * `adorn_ns()` no longer prints leading whitespace when `position = "front"` - this is not a visible change in the printed result and it would be rare that this affects any code.
+
 ## New features
 
 * `row_to_names()` now has a new helper function, `find_header()` to help find the row that contains the names.  It can be used by passing `row_number="find_header"`, and see the documentation of `row_to_names()` and `find_header()` for more examples. (fix #429)
@@ -24,6 +29,8 @@
 
 * A new function `get_one_to_one()` has been added to find columns that map 1:1 to each other, even if the values within the columns differ (fix #291, **@billdenney**)
 
+* `adorn_Ns()` contains a new `format_func` argument so that the user can format the Ns to their liking, e.g., changing the `big.mark` character. (#444)
+
 * `clean_names()` can now be called on database connection in a dbplyr code pipeline (#467)
 
 ## Minor features
diff --git a/R/adorn_ns.R b/R/adorn_ns.R
index d202cbdb..4e62f7ca 100644
--- a/R/adorn_ns.R
+++ b/R/adorn_ns.R
@@ -5,7 +5,8 @@
 #'
 #' @param dat a data.frame of class \code{tabyl} that has had \code{adorn_percentages} and/or \code{adorn_pct_formatting} called on it.  If given a list of data.frames, this function will apply itself to each data.frame in the list (designed for 3-way \code{tabyl} lists).
 #' @param position should the N go in the front, or in the rear, of the percentage?
-#' @param ns the Ns to append.  The default is the "core" attribute of the input tabyl \code{dat}, where the original Ns of a two-way \code{tabyl} are stored.  However, if you need to modify the numbers, e.g., to format \code{4000} as \code{4,000} or \code{4k}, you can do that separately and supply the formatted result here.
+#' @param ns the Ns to append.  The default is the "core" attribute of the input tabyl \code{dat}, where the original Ns of a two-way \code{tabyl} are stored.  However, if your Ns are stored somewhere else, or you need to customize them beyond what can be done with `format_func`, you can supply them here.
+#' @param format_func a formatting function to run on the Ns.  Consider defining with \code{base::format()}. 
 #' @param ... columns to adorn.  This takes a tidyselect specification.  By default, all columns are adorned except for the first column and columns not of class \code{numeric}, but this allows you to manually specify which columns should be adorned, for use on a data.frame that does not result from a call to \code{tabyl}. 
 #'
 #' @return a data.frame with Ns appended
@@ -18,6 +19,19 @@
 #'   adorn_pct_formatting() %>%
 #'   adorn_ns(position = "front")
 #'   
+#' # Format the Ns with a custom format_func:
+#' set.seed(1)
+#' bigger_dat <- data.frame(sex = rep(c("m", "f"), 3000),
+#'                          age = round(runif(3000, 1, 102), 0))
+#' bigger_dat$age_group = cut(bigger_dat$age, quantile(bigger_dat$age, c(0, 1/3, 2/3, 1)))
+#' 
+#' bigger_dat %>%
+#'   tabyl(age_group, sex, show_missing_levels = FALSE) %>%
+#'   adorn_totals(c("row", "col")) %>%
+#'   adorn_percentages("col") %>%
+#'   adorn_pct_formatting(digits = 1) %>% 
+#'   adorn_ns(format_func = function(x) format(x, big.mark = ".", decimal.mark = ","))
+
 #' # Control the columns to be adorned with the ... variable selection argument
 #' # If using only the ... argument, you can use empty commas as shorthand 
 #' # to supply the default values to the preceding arguments:
@@ -31,10 +45,10 @@
 #' 
 #'cases %>%
 #'  adorn_percentages("col",,recovered:died) %>%
-#'  adorn_pct_formatting(,,,recovered:died) %>%
-#'  adorn_ns(,,recovered:died)
+#'  adorn_pct_formatting(,,,,,recovered:died) %>%
+#'  adorn_ns(,,,recovered:died)
 #'   
-adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), ...) {
+adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), format_func = function(x) { format(x, big.mark = ",") }, ...) {
   # if input is a list, call purrr::map to recursively apply this function to each data.frame
   if (is.list(dat) && !is.data.frame(dat)) {
     purrr::map(dat, adorn_ns, position) # okay not to pass ns and allow for static Ns, b/c one size fits all for each list entry doesn't make sense for Ns.
@@ -66,7 +80,7 @@ adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), ...) {
     if (custom_ns_supplied & !identical(dim(ns), dim(dat))) { # user-supplied Ns must include values for totals row/col if present
       stop("if supplying your own data.frame of Ns to append, its dimensions must match those of the data.frame in the \"dat\" argument")
     }
-
+    
     # If appending the default Ns from the core, and there are totals rows/cols, append those values to the Ns table
     # Custom inputs to ns argument will need to calculate & format their own totals row/cols
     if (!custom_ns_supplied) {
@@ -74,6 +88,9 @@ adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), ...) {
         ns <- adorn_totals(ns, attr(dat, "totals"))
         ns <- ns[order(match(ns[, 1], dat[, 1])), ] # from #407 - in rare event Totals row has been sorted off the bottom, sort to match
       }
+      numeric_cols <- which(vapply(ns, is.numeric, logical(1)))
+      ns[] <- lapply(ns, format_func)
+      ns[] <- lapply(ns, stringr::str_trim)
     }
     
     if (position == "rear") {
@@ -92,7 +109,7 @@ adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), ...) {
     if(custom_ns_supplied & rlang::dots_n(...) == 0){
       dont_adorn <- 1L
     } else if(rlang::dots_n(...) == 0){
-      cols_to_adorn <- which(vapply(ns, is.numeric, logical(1))) # numeric cols
+      cols_to_adorn <- numeric_cols
       dont_adorn <- setdiff(1:ncol(dat), cols_to_adorn)
       dont_adorn <- unique(c(1, dont_adorn)) # always don't-append first column
     } else {
diff --git a/R/get_dupes.R b/R/get_dupes.R
index 98f355fb..56021a31 100644
--- a/R/get_dupes.R
+++ b/R/get_dupes.R
@@ -50,7 +50,7 @@ get_dupes <- function(dat, ...) {
     dplyr::add_count(!!! nms, name = "dupe_count") %>%
     dplyr::filter(dupe_count > 1) %>%
     dplyr::select(!!! nms, dupe_count, dplyr::everything()) %>%
-    dplyr::arrange(desc(dupe_count), !!! nms)
+    dplyr::arrange(dplyr::desc(dupe_count), !!! nms)
   
   # shorten error message for large data.frames
   if (length(var_names) > 10) {
diff --git a/man/adorn_ns.Rd b/man/adorn_ns.Rd
index 3c6c672e..e29e6d48 100644
--- a/man/adorn_ns.Rd
+++ b/man/adorn_ns.Rd
@@ -4,14 +4,24 @@
 \alias{adorn_ns}
 \title{Add underlying Ns to a tabyl displaying percentages.}
 \usage{
-adorn_ns(dat, position = "rear", ns = attr(dat, "core"), ...)
+adorn_ns(
+  dat,
+  position = "rear",
+  ns = attr(dat, "core"),
+  format_func = function(x) {
+     format(x, big.mark = ",")
+ },
+  ...
+)
 }
 \arguments{
 \item{dat}{a data.frame of class \code{tabyl} that has had \code{adorn_percentages} and/or \code{adorn_pct_formatting} called on it.  If given a list of data.frames, this function will apply itself to each data.frame in the list (designed for 3-way \code{tabyl} lists).}
 
 \item{position}{should the N go in the front, or in the rear, of the percentage?}
 
-\item{ns}{the Ns to append.  The default is the "core" attribute of the input tabyl \code{dat}, where the original Ns of a two-way \code{tabyl} are stored.  However, if you need to modify the numbers, e.g., to format \code{4000} as \code{4,000} or \code{4k}, you can do that separately and supply the formatted result here.}
+\item{ns}{the Ns to append.  The default is the "core" attribute of the input tabyl \code{dat}, where the original Ns of a two-way \code{tabyl} are stored.  However, if your Ns are stored somewhere else, or you need to customize them beyond what can be done with `format_func`, you can supply them here.}
+
+\item{format_func}{a formatting function to run on the Ns.  Consider defining with \code{base::format()}.}
 
 \item{...}{columns to adorn.  This takes a tidyselect specification.  By default, all columns are adorned except for the first column and columns not of class \code{numeric}, but this allows you to manually specify which columns should be adorned, for use on a data.frame that does not result from a call to \code{tabyl}.}
 }
@@ -29,6 +39,18 @@ mtcars \%>\%
   adorn_pct_formatting() \%>\%
   adorn_ns(position = "front")
   
+# Format the Ns with a custom format_func:
+set.seed(1)
+bigger_dat <- data.frame(sex = rep(c("m", "f"), 3000),
+                         age = round(runif(3000, 1, 102), 0))
+bigger_dat$age_group = cut(bigger_dat$age, quantile(bigger_dat$age, c(0, 1/3, 2/3, 1)))
+
+bigger_dat \%>\%
+  tabyl(age_group, sex, show_missing_levels = FALSE) \%>\%
+  adorn_totals(c("row", "col")) \%>\%
+  adorn_percentages("col") \%>\%
+  adorn_pct_formatting(digits = 1) \%>\% 
+  adorn_ns(format_func = function(x) format(x, big.mark = ".", decimal.mark = ","))
 # Control the columns to be adorned with the ... variable selection argument
 # If using only the ... argument, you can use empty commas as shorthand 
 # to supply the default values to the preceding arguments:
@@ -42,7 +64,7 @@ cases <- data.frame(
 
 cases \%>\%
  adorn_percentages("col",,recovered:died) \%>\%
- adorn_pct_formatting(,,,recovered:died) \%>\%
- adorn_ns(,,recovered:died)
+ adorn_pct_formatting(,,,,,recovered:died) \%>\%
+ adorn_ns(,,,recovered:died)
   
 }
diff --git a/tests/testthat/test-adorn-ns.R b/tests/testthat/test-adorn-ns.R
index e94a0b85..e6126a6e 100644
--- a/tests/testthat/test-adorn-ns.R
+++ b/tests/testthat/test-adorn-ns.R
@@ -33,8 +33,8 @@ test_that("front parameter works", {
       untabyl(),
     data.frame(
       x = c(letters[1:4], "Total"),
-      `0` = c("417 (82.7%)", "  2  (0.4%)", "  1  (0.2%)", "  0  (0.0%)", "420 (83.3%)"),
-      `1` = c("83 (16.5%)", " 0  (0.0%)", " 0  (0.0%)", " 1  (0.2%)", "84 (16.7%)"),
+      `0` = c("417 (82.7%)", "2  (0.4%)", "1  (0.2%)", "0  (0.0%)", "420 (83.3%)"),
+      `1` = c("83 (16.5%)", "0  (0.0%)", "0  (0.0%)", "1  (0.2%)", "84 (16.7%)"),
       check.names = FALSE,
       stringsAsFactors = FALSE
     )
@@ -198,3 +198,42 @@ test_that("adorn_ns works on single column data.frame with custom Ns if tidysele
     adorn_ns(ns = dplyr::select(attr(adorned_single, "core"), a = `4`),,,, a)
   expect_equal(stringr::str_sub(adorned_single$a, -4, -1), c(" (3)", " (8)"))
 })
+
+# This tests the display of the decimal.mark by forcing a decimal into a tabyl
+# Can't happen with a natural table, but maybe someone will use adorn_ns on a homespun data.frame
+test_that("formatting function works, #444", {
+  set.seed(1)
+  bigger_dat <- data.frame(sex = rep(c("m", "f"), 3000),
+                           age = round(runif(3000, 1, 102), 0))
+  bigger_dat$age_group = cut(bigger_dat$age, quantile(bigger_dat$age, c(0, 1/3, 2/3, 1)))
+  
+  bigger_tab <- bigger_dat %>%
+    tabyl(age_group, sex, show_missing_levels = F)
+  
+  standard_output <- bigger_tab %>%
+    adorn_percentages("col") %>%
+    adorn_pct_formatting(digits = 1) %>% 
+    adorn_ns(position = "front")
+  
+  # test commas in thousands place by default
+  expect_equal(standard_output$f,
+               c("1,018 (33.9%)", "990 (33.0%)", "980 (32.7%)", "12  (0.4%)")
+               )
+  
+  # Test decimal mark
+  bigger_tab$f[1] <- 1018.5 # makes no sense in a tabyl but need for testing decimal mark display
+  
+  bigger_result <- bigger_tab %>%
+    untabyl() %>% # to get the decimal into the core
+    as_tabyl() %>% 
+    adorn_totals(c("row", "col")) %>%
+    adorn_percentages("col") %>%
+    adorn_pct_formatting(digits = 1) %>% 
+    adorn_ns(position = "rear", format_func = function(x) format(x, big.mark = ".", decimal.mark = ","))
+  
+  expect_equal(
+    bigger_result$f,
+    c("33.9% (1.018,5)", "33.0%   (990,0)", "32.7%   (980,0)", "0.4%    (12,0)", 
+      "100.0% (3.000,5)")
+  )
+})
\ No newline at end of file