some stuff

AnthonyTedde · Jan 14, 2020 · e9a95fd · e9a95fd
1 parent 4fd4f02
commit e9a95fd
Show file tree

Hide file tree

Showing 18 changed files with 446 additions and 19 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,3 +9,10 @@ Description: More about what it does (maybe more than one line)
 License: What license is it under?
 Encoding: UTF-8
 LazyData: true
+RoxygenNote: 6.1.1
+Imports: dplyr,
+          purrr,
+          caret,
+          tibble
+Suggests: 
+    testthat
diff --git a/NAMESPACE b/NAMESPACE
@@ -1 +1,9 @@
-exportPattern("^[[:alpha:]]+")
+# Generated by roxygen2: do not edit by hand
+
+export(GH_outlier)
+export(log_message)
+export(outliars)
+export(t_test_outlier)
+export(unscale)
+export(workout)
+importFrom(purrr,"%>%")
diff --git a/R/hello.R b/R/hello.R
diff --git a/R/log_message.R b/R/log_message.R
@@ -0,0 +1,21 @@
+#' Title
+#'
+#' @param message
+#' @param title
+#'
+#' @return
+#' @export
+#'
+#' @examples
+log_message <- function(message, title = T){
+
+  if(title){
+    cat(paste(rep("*", 50), collapse = ""), fill = T)
+    cat(message, fill = T)
+    cat(paste(rep("*", 50), collapse = ""), fill = T)
+  }else{
+    cat(message, fill = T)
+  }
+
+  flush.console()
+}
diff --git a/R/outliars.R b/R/outliars.R
@@ -0,0 +1,169 @@
+#' Outlier detection for quantitative data
+#'
+#' @param data Data.frame. Candidate data for outliers detection.
+#' @param std_err Integer. Which specifies how far from the mean a sample should be for being considered as an outlier.
+#' @param remove Boolean. Default TRUE. If TRUE, then the function returns a new dataset minus the assessed outliers. Otherwise, it returns a logical atomic vector specifying -  by the boolean value False - which data is an outlier.
+#' @param verbose Boolean. Default TRUE.  If TRUE, some of the intermediate results are output to the console.
+#'
+#' @return See remove argument.
+#' @export
+#'
+#' @examples
+outliars <- function(data,
+                     std_err = 3,
+                     remove = TRUE,
+                     verbose = TRUE){
+
+
+  outliars_subset <- data %>%
+    dplyr::select_if(is.numeric) %>%
+    purrr::map(
+      ~as.list(
+        mean(.x, na.rm = T) + c(-1, 1) * std_err * sd(.x, na.rm = T)) %>%
+        setNames(nm = c("lower_bound", "upper_bound")
+        )
+    ) %>%
+    purrr::imap(
+      ~data[[.y]] > .x$lower_bound & data[[.y]] < .x$upper_bound
+    ) %>%
+    purrr::reduce(`*`) %>% as.logical()
+
+
+  if(verbose){
+    message <- c(
+      paste0(sum(!outliars_subset), "/", length(outliars_subset)),
+      "rows removed"
+    )
+    utilitR::log_message(message = message, title = F)
+  }
+
+
+  if(remove)
+    dplyr::filter(data, outliars_subset)
+  else outliars_subset
+}
+
+#' Title
+#'
+#' @param formula depedent var ~ independent ones.
+#' @param data Candidate data.
+#' @param method List of methods to apply.
+#' @param ...
+#'
+#' @return a logical atomic vector of row outliers based on the combination of
+#' all performed t-test outliers detection.
+#' @export
+#'
+#' @examples
+t_test_outlier <- function(formula, data, method, verbose = T, ...){
+
+
+  ###################################
+  # Create the formula based on y
+  ###################################
+
+  f <- dplyr::enquo(formula) %>%
+    dplyr::as_label() %>% formula
+
+  method %>% purrr::map(.f = function(m){
+
+    outliers <- rep(T, nrow(data))
+    ctrl <- caret::trainControl(method = "cv")
+
+    if(verbose)
+      utilitR::log_message(message = m$method)
+
+    repeat{
+
+      arguments <- c(list(
+        form = f,
+        data = data[outliers, ],
+        preProcess = c("center", "scale", "nzv"),
+        trControl = ctrl
+      ), m)
+
+      data_model <- do.call(caret::train, arguments)
+
+      data_prediction <- predict(data_model, data[outliers, ])
+      residuals <- data_model$trainingData$.outcome - data_prediction
+
+      outliers_current <- utilitR::outliars(data.frame(residuals), remove = F)
+
+      if(all(outliers_current)) break
+      else outliers[which(outliers)[which(!outliers_current)]] <- F
+
+
+    } # End of the repeat loop
+    return(outliers)
+  }) %>%
+    Reduce(f = `*`) %>% as.logical
+}
+
+#' Remove row outliers based on the standardized Mahalanobis distance of
+#' data points.
+#'
+#' @inheritParams outliars
+#' @param std_err Threshold above which data is considered as outlier.
+#' @param ... Argument to pass through caret::preProcess function. e.g.
+#' thresh. (see caret::preProcess)
+#'
+#' @return
+#' @export
+#'
+#' @examples
+GH_outlier <- function(data,
+                       variables = dplyr::everything(),
+                       std_err = 3,
+                       remove = TRUE,
+                       verbose = TRUE,
+                       ...){
+
+
+  ###################################
+  # data management and preprocess
+  ###################################
+
+  data <- data %>%
+    dplyr::select(variables)
+
+  data_pca_preproc <- data %>%
+    caret::preProcess(method = c("center", "scale", "nzv", "pca"), ...)
+
+  data_pca <- predict(data_pca_preproc, data) %>%
+    tibble::as_tibble()
+
+
+  ###################################
+  # Mahalanobis and GH
+  ###################################
+
+  pc_mahalanobis <- data_pca %>%
+    mahalanobis(., center = colMeans(.), cov = cov(.))
+
+  pc_GH <- pc_mahalanobis / ncol(data_pca)
+  row_to_keep <- pc_GH < std_err
+
+
+  ###################################
+  # log Message
+  ###################################
+
+  if(verbose){
+    line1 = paste0("Outliers: ", sum(!row_to_keep))
+    line2 = paste0("Proportion: ", sum(!row_to_keep) / length(row_to_keep))
+    message = paste(line1, line2, sep = "\n")
+    utilitR::log_message(message = message)
+  }
+
+
+  ###################################
+  # remove -> return the new dataset.
+  ###################################
+
+  if(remove)
+    data %>%
+    dplyr::filter(row_to_keep) %>%
+    tibble::as_tibble()
+  else row_to_keep
+
+}
diff --git a/R/unscale.R b/R/unscale.R
@@ -0,0 +1,16 @@
+#' Unscaled scaled data according to the former ones.
+#'
+#' @param scaled_data The data to by unscaled.
+#' @param template The template data for unscaling.
+#'
+#' @return A data.frame of unscaled data.
+#' @export
+#'
+#' @examples
+unscale <- function(scaled_data, template){
+  purrr::pmap_dfc(list(scaled_data, template), .f = function(s, t){
+    if(is.numeric(s))
+      s * sd(t, na.rm = T) + mean(t, na.rm = T)
+    else s
+  })
+}
diff --git a/R/utilitR.R b/R/utilitR.R
@@ -0,0 +1,5 @@
+#' @keywords internal
+"_PACKAGE"
+
+#' @importFrom purrr %>%
+NULL
diff --git a/R/workout.R b/R/workout.R
@@ -0,0 +1,41 @@
+#' Workout
+#'
+#' @description A function to perform models training accross with different
+#' data. Simultaneously.
+#'
+#' @param args All the argumnets to pass through the caret::train function
+#' @param seed A seed. For reproducibility.
+#' @param subset The row subset of data to use for the analysis. All by default.
+#' @param features A subset of regressors to use. All by default.
+#'
+#' @return A list of caret object.
+#'
+#' @export
+#'
+#' @examples
+workout <- function(args, seed = 1010, subset = T, features = T){
+  args %>% purrr::map(.f = function(arg){
+    cat(arg$method)
+    cat(arg$data)
+
+    ## Get the data from character vector denoting the variable name
+    arg$data <- get(arg$data)[subset, ]
+
+    set.seed(seed)
+    do.call(caret::train, arg)
+  })
+}
+
+workout_prepare <- function(data_lst, method_lst, method_parameter_lst){
+  method_name <- names(method_lst)
+  data_name <- names(data_lst)
+  mdls_names <- do.call(paste,
+                        c(expand.grid(data_name, method_name), sep = "_"))
+  args <- purrr::cross3(data_lst, method_lst, method_parameter_lst)
+  # %>%
+  #   purrr::map(purrr::flatten)
+  return(list(
+    args = args,
+    mdls_names = mdls_names
+  ))
+}
diff --git a/man/GH_outlier.Rd b/man/GH_outlier.Rd
diff --git a/man/log_message.Rd b/man/log_message.Rd
diff --git a/man/outliars.Rd b/man/outliars.Rd
diff --git a/man/t_test_outlier.Rd b/man/t_test_outlier.Rd