Skip to content

Commit

Permalink
some stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
AnthonyTedde committed Jan 14, 2020
1 parent 4fd4f02 commit e9a95fd
Show file tree
Hide file tree
Showing 18 changed files with 446 additions and 19 deletions.
7 changes: 7 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,10 @@ Description: More about what it does (maybe more than one line)
License: What license is it under?
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.1
Imports: dplyr,
purrr,
caret,
tibble
Suggests:
testthat
10 changes: 9 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
exportPattern("^[[:alpha:]]+")
# Generated by roxygen2: do not edit by hand

export(GH_outlier)
export(log_message)
export(outliars)
export(t_test_outlier)
export(unscale)
export(workout)
importFrom(purrr,"%>%")
18 changes: 0 additions & 18 deletions R/hello.R

This file was deleted.

21 changes: 21 additions & 0 deletions R/log_message.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#' Title
#'
#' @param message
#' @param title
#'
#' @return
#' @export
#'
#' @examples
log_message <- function(message, title = T){

if(title){
cat(paste(rep("*", 50), collapse = ""), fill = T)
cat(message, fill = T)
cat(paste(rep("*", 50), collapse = ""), fill = T)
}else{
cat(message, fill = T)
}

flush.console()
}
169 changes: 169 additions & 0 deletions R/outliars.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#' Outlier detection for quantitative data
#'
#' @param data Data.frame. Candidate data for outliers detection.
#' @param std_err Integer. Which specifies how far from the mean a sample should be for being considered as an outlier.
#' @param remove Boolean. Default TRUE. If TRUE, then the function returns a new dataset minus the assessed outliers. Otherwise, it returns a logical atomic vector specifying - by the boolean value False - which data is an outlier.
#' @param verbose Boolean. Default TRUE. If TRUE, some of the intermediate results are output to the console.
#'
#' @return See remove argument.
#' @export
#'
#' @examples
outliars <- function(data,
std_err = 3,
remove = TRUE,
verbose = TRUE){


outliars_subset <- data %>%
dplyr::select_if(is.numeric) %>%
purrr::map(
~as.list(
mean(.x, na.rm = T) + c(-1, 1) * std_err * sd(.x, na.rm = T)) %>%
setNames(nm = c("lower_bound", "upper_bound")
)
) %>%
purrr::imap(
~data[[.y]] > .x$lower_bound & data[[.y]] < .x$upper_bound
) %>%
purrr::reduce(`*`) %>% as.logical()


if(verbose){
message <- c(
paste0(sum(!outliars_subset), "/", length(outliars_subset)),
"rows removed"
)
utilitR::log_message(message = message, title = F)
}


if(remove)
dplyr::filter(data, outliars_subset)
else outliars_subset
}

#' Title
#'
#' @param formula depedent var ~ independent ones.
#' @param data Candidate data.
#' @param method List of methods to apply.
#' @param ...
#'
#' @return a logical atomic vector of row outliers based on the combination of
#' all performed t-test outliers detection.
#' @export
#'
#' @examples
t_test_outlier <- function(formula, data, method, verbose = T, ...){


###################################
# Create the formula based on y
###################################

f <- dplyr::enquo(formula) %>%
dplyr::as_label() %>% formula

method %>% purrr::map(.f = function(m){

outliers <- rep(T, nrow(data))
ctrl <- caret::trainControl(method = "cv")

if(verbose)
utilitR::log_message(message = m$method)

repeat{

arguments <- c(list(
form = f,
data = data[outliers, ],
preProcess = c("center", "scale", "nzv"),
trControl = ctrl
), m)

data_model <- do.call(caret::train, arguments)

data_prediction <- predict(data_model, data[outliers, ])
residuals <- data_model$trainingData$.outcome - data_prediction

outliers_current <- utilitR::outliars(data.frame(residuals), remove = F)

if(all(outliers_current)) break
else outliers[which(outliers)[which(!outliers_current)]] <- F


} # End of the repeat loop
return(outliers)
}) %>%
Reduce(f = `*`) %>% as.logical
}

#' Remove row outliers based on the standardized Mahalanobis distance of
#' data points.
#'
#' @inheritParams outliars
#' @param std_err Threshold above which data is considered as outlier.
#' @param ... Argument to pass through caret::preProcess function. e.g.
#' thresh. (see caret::preProcess)
#'
#' @return
#' @export
#'
#' @examples
GH_outlier <- function(data,
variables = dplyr::everything(),
std_err = 3,
remove = TRUE,
verbose = TRUE,
...){


###################################
# data management and preprocess
###################################

data <- data %>%
dplyr::select(variables)

data_pca_preproc <- data %>%
caret::preProcess(method = c("center", "scale", "nzv", "pca"), ...)

data_pca <- predict(data_pca_preproc, data) %>%
tibble::as_tibble()


###################################
# Mahalanobis and GH
###################################

pc_mahalanobis <- data_pca %>%
mahalanobis(., center = colMeans(.), cov = cov(.))

pc_GH <- pc_mahalanobis / ncol(data_pca)
row_to_keep <- pc_GH < std_err


###################################
# log Message
###################################

if(verbose){
line1 = paste0("Outliers: ", sum(!row_to_keep))
line2 = paste0("Proportion: ", sum(!row_to_keep) / length(row_to_keep))
message = paste(line1, line2, sep = "\n")
utilitR::log_message(message = message)
}


###################################
# remove -> return the new dataset.
###################################

if(remove)
data %>%
dplyr::filter(row_to_keep) %>%
tibble::as_tibble()
else row_to_keep

}
16 changes: 16 additions & 0 deletions R/unscale.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#' Unscaled scaled data according to the former ones.
#'
#' @param scaled_data The data to by unscaled.
#' @param template The template data for unscaling.
#'
#' @return A data.frame of unscaled data.
#' @export
#'
#' @examples
unscale <- function(scaled_data, template){
purrr::pmap_dfc(list(scaled_data, template), .f = function(s, t){
if(is.numeric(s))
s * sd(t, na.rm = T) + mean(t, na.rm = T)
else s
})
}
5 changes: 5 additions & 0 deletions R/utilitR.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#' @keywords internal
"_PACKAGE"

#' @importFrom purrr %>%
NULL
41 changes: 41 additions & 0 deletions R/workout.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#' Workout
#'
#' @description A function to perform models training accross with different
#' data. Simultaneously.
#'
#' @param args All the argumnets to pass through the caret::train function
#' @param seed A seed. For reproducibility.
#' @param subset The row subset of data to use for the analysis. All by default.
#' @param features A subset of regressors to use. All by default.
#'
#' @return A list of caret object.
#'
#' @export
#'
#' @examples
workout <- function(args, seed = 1010, subset = T, features = T){
args %>% purrr::map(.f = function(arg){
cat(arg$method)
cat(arg$data)

## Get the data from character vector denoting the variable name
arg$data <- get(arg$data)[subset, ]

set.seed(seed)
do.call(caret::train, arg)
})
}

workout_prepare <- function(data_lst, method_lst, method_parameter_lst){
method_name <- names(method_lst)
data_name <- names(data_lst)
mdls_names <- do.call(paste,
c(expand.grid(data_name, method_name), sep = "_"))
args <- purrr::cross3(data_lst, method_lst, method_parameter_lst)
# %>%
# purrr::map(purrr::flatten)
return(list(
args = args,
mdls_names = mdls_names
))
}
26 changes: 26 additions & 0 deletions man/GH_outlier.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions man/log_message.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions man/outliars.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions man/t_test_outlier.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit e9a95fd

Please sign in to comment.