diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 7c7151e..f89e8c4 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -260,7 +260,7 @@ jobs: run: | ## Temporary workaround for https://github.com/actions/checkout/issues/766 git config --global --add safe.directory "$GITHUB_WORKSPACE" - + git config --local user.email "actions@github.com" git config --local user.name "GitHub Actions" Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)" @@ -276,3 +276,4 @@ jobs: with: name: ${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-results path: check + diff --git a/DESCRIPTION b/DESCRIPTION index 8c0fe82..0f94ff9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,7 +28,8 @@ Encoding: UTF-8 LazyData: true Biarch: true Depends: - R (>= 4.1.0) + R (>= 4.1.0), + rstan (>= 2.18.1) Imports: benchmarkme, dplyr, @@ -44,7 +45,6 @@ Imports: Rcpp (>= 0.12.0), RcppParallel (>= 5.0.1), rlang, - rstan (>= 2.18.1), rstantools (>= 2.1.1), stats, tibble, diff --git a/NAMESPACE b/NAMESPACE index 87a9d51..33c10eb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,8 +16,6 @@ importFrom(RcppParallel,CxxFlags) importFrom(RcppParallel,RcppParallelLibs) importFrom(benchmarkme,get_ram) importFrom(dplyr,enquo) -importFrom(edgeR,calcNormFactors) -importFrom(edgeR,filterByExpr) importFrom(foreach,"%do%") importFrom(foreach,foreach) importFrom(graphics,par) @@ -33,12 +31,11 @@ importFrom(purrr,map2) importFrom(purrr,map_chr) importFrom(purrr,map_int) importFrom(purrr,pmap) +importFrom(purrr,reduce) importFrom(purrr,when) importFrom(rlang,":=") -importFrom(rlang,enquo) importFrom(rlang,quo_is_null) importFrom(rlang,quo_is_symbol) -importFrom(rstan,extract) importFrom(rstan,sampling) importFrom(rstan,summary) importFrom(rstan,vb) @@ -64,6 +61,7 @@ importFrom(tidyr,separate) importFrom(tidyr,spread) importFrom(tidyr,unnest) importFrom(utils,head) +importFrom(utils,install.packages) importFrom(utils,packageVersion) importFrom(utils,tail) useDynLib(ppcseq, .registration = TRUE) diff --git a/R/ppcseq.R b/R/methods.R similarity index 84% rename from R/ppcseq.R rename to R/methods.R index 8ff2a1e..33d1047 100644 --- a/R/ppcseq.R +++ b/R/methods.R @@ -28,6 +28,7 @@ #' @param .abundance A column name as symbol. The transcript abundance (read count) #' @param .significance A column name as symbol. A column with the Pvalue, or other significance measure (preferred Pvalue over false discovery rate) #' @param .do_check A column name as symbol. A column with a boolean indicating whether a transcript was identified as differentially abundant +#' @param .scaling_factor In case the scaling factor must not be calculated (TMM method) using the input data but provided. It is useful, for example, for pseudobulk single-cell where the scaling might depend on sample sequencing depth for all cells rather than a particular cell type. #' @param percent_false_positive_genes A real between 0 and 100. It is the aimed percent of transcript being a false positive. For example, percent_false_positive_genes = 1 provide 1 percent of the calls for outlier containing transcripts that has actually not outliers. #' @param approximate_posterior_inference A boolean. Whether the inference of the joint posterior distribution should be approximated with variational Bayes It confers execution time advantage. #' @param approximate_posterior_analysis A boolean. Whether the calculation of the credible intervals should be done semi-analytically, rather than with pure sampling from the posterior. It confers execution time and memory advantage. @@ -43,7 +44,7 @@ #' @param seed An integer. Used for development and testing purposes #' @param adj_prob_theshold_2 A boolean. Used for development and testing purposes #' -#' @return A nested tibble `tbl` with transcript-wise information: `sample wise data` | plot | `ppc samples failed` | `tot deleterious outliers` +#' @return A nested tibble `tbl` with transcript-wise information: `sample_wise_data` | plot | `ppc samples failed` | `tot deleterious_outliers` #' #' @examples #' @@ -77,6 +78,7 @@ identify_outliers = function(.data, .abundance, .significance, .do_check, + .scaling_factor = NULL, percent_false_positive_genes = 1, how_many_negative_controls = 500, @@ -100,15 +102,16 @@ identify_outliers = function(.data, .abundance = enquo(.abundance) .significance = enquo(.significance) .do_check = enquo(.do_check) + .scaling_factor = enquo(.scaling_factor) # Get factor of interest #factor_of_interest = ifelse(parse_formula(formula) %>% length %>% `>` (0), parse_formula(formula)[1], "") # Check if columns exist - check_columns_exist(.data, !!.sample, !!.transcript, !!.abundance, !!.significance, !!.do_check) + check_columns_exist(.data, !!.sample, !!.transcript, !!.abundance, !!.significance) # Check if any column is NA or null - check_if_any_NA(.data, !!.sample, !!.transcript, !!.abundance, !!.significance, !!.do_check, parse_formula(formula)) + check_if_any_NA(.data, c(quo_name(.sample), quo_name(.transcript), quo_name(.abundance), quo_name(.significance), parse_formula(formula))) # Check if I have any genes to check if(.data %>% filter(!!.do_check) %>% nrow %>% equals(0)){ @@ -120,7 +123,7 @@ identify_outliers = function(.data, d = 1L ) %>% slice(0) %>% - setNames(c(quo_name(.transcript), "sample wise data", "ppc samples failed", "tot deleterious outliers"))) + setNames(c(quo_name(.transcript), "sample_wise_data", "ppc samples failed", "tot deleterious_outliers"))) } @@ -193,13 +196,15 @@ identify_outliers = function(.data, # distinct_at is not released yet for dplyr, thus we have to use this trick - my_df <- format_input( - .data, + my_df <- + .data %>% + mutate(do_check___ = !!.do_check) %>% + format_input( formula, !!.sample, !!.transcript, !!.abundance, - !!.do_check, + do_check___, !!.significance, how_many_negative_controls ) @@ -212,46 +217,65 @@ identify_outliers = function(.data, # Prior info lambda_mu_mu = 5.612671 - # Scale dataset - my_df_scaled = - my_df %>% - .identify_abundant(!!.sample,!!.transcript,!!.abundance) %>% - get_scaled_counts_bulk(!!.sample,!!.transcript,!!.abundance) %>% - left_join(my_df, by=quo_name(.sample)) %>% - dplyr::mutate(!!as.symbol(sprintf("%s_scaled", quo_name(.abundance))) := !!.abundance * multiplier) - - # Build better scales for the inference - exposure_rate_multiplier = - my_df_scaled %>% - distinct(!!.sample, TMM, multiplier) %>% - mutate(l = multiplier %>% log) %>% - summarise(l %>% sd) %>% - pull(`l %>% sd`) - - # Build better scales for the inference - intercept_shift_scale = - my_df_scaled %>% - mutate(cc = - !!as.symbol(sprintf( - "%s_scaled", quo_name(.abundance) - )) %>% - `+` (1) %>% log) %>% - summarise(shift = cc %>% mean, scale = cc %>% sd) %>% - as.numeric + + # Find normalisation + if(quo_is_null(.scaling_factor)) + sample_scaling = + my_df %>% + .identify_abundant(!!.sample,!!.transcript,!!.abundance) %>% + get_scaled_counts_bulk(!!.sample,!!.transcript,!!.abundance) %>% + distinct(!!.sample, multiplier) + else + sample_scaling = + .data %>% + distinct(!!.sample, multiplier= !!.scaling_factor) %>% + distinct() + + sample_exposure = + sample_scaling %>% + mutate(exposure_rate = -log(multiplier)) %>% + mutate(exposure_multiplier = exp(exposure_rate)) %>% + rename(multiplier := !!.scaling_factor) + + # # Scale dataset + # my_df_scaled = + # my_df %>% + # .identify_abundant(!!.sample,!!.transcript,!!.abundance) %>% + # get_scaled_counts_bulk(!!.sample,!!.transcript,!!.abundance) %>% + # left_join(my_df, by=quo_name(.sample)) %>% + # dplyr::mutate(!!as.symbol(sprintf("%s_scaled", quo_name(.abundance))) := !!.abundance * multiplier) + + # # Build better scales for the inference + # exposure_rate_multiplier = + # my_df_scaled %>% + # distinct(!!.sample, TMM, multiplier) %>% + # mutate(l = multiplier %>% log) %>% + # summarise(l %>% sd) %>% + # pull(`l %>% sd`) + # + # # Build better scales for the inference + # intercept_shift_scale = + # my_df_scaled %>% + # mutate(cc = + # !!as.symbol(sprintf( + # "%s_scaled", quo_name(.abundance) + # )) %>% + # `+` (1) %>% log) %>% + # summarise(shift = cc %>% mean, scale = cc %>% sd) %>% + # as.numeric # Run the first discovery phase with permissive false discovery rate res_discovery = my_df %>% do_inference( - formula,!!.sample ,!!.transcript ,!!.abundance ,!!.significance ,!!.do_check, + formula,!!.sample ,!!.transcript ,!!.abundance ,!!.significance ,do_check___, approximate_posterior_inference, approximate_posterior_analysis = FALSE, C, X, lambda_mu_mu, cores, - exposure_rate_multiplier, - intercept_shift_scale, + sample_exposure, additional_parameters_to_save, adj_prob_theshold = adj_prob_theshold_1, how_many_posterior_draws = how_many_posterior_draws_1, @@ -270,7 +294,7 @@ identify_outliers = function(.data, filter(`.variable` == "counts_rng") %>% ifelse_pipe( do_check_only_on_detrimental, - ~ .x %>% filter(`deleterious outliers`), + ~ .x %>% filter(`deleterious_outliers`), ~ .x %>% filter(!ppc) ) %>% distinct(S, G, .lower, .upper) @@ -296,15 +320,14 @@ identify_outliers = function(.data, res_test = my_df %>% do_inference( - formula,!!.sample ,!!.transcript ,!!.abundance ,!!.significance ,!!.do_check, + formula,!!.sample ,!!.transcript ,!!.abundance ,!!.significance ,do_check___, approximate_posterior_inference, approximate_posterior_analysis, C, X, lambda_mu_mu, cores, - exposure_rate_multiplier, - intercept_shift_scale, + sample_exposure, additional_parameters_to_save, adj_prob_theshold = adj_prob_theshold_2, # If check only deleterious is one side test # * 2 because we just test one side of the distribution @@ -320,23 +343,9 @@ identify_outliers = function(.data, # Merge results and return merge_results( - - # Calculate CI 2 for discovery for plotting res_discovery, - - # %>% - # left_join( - # (.) %>% - # attr("fit") %>% - # fit_to_counts_rng_approximated(adj_prob_theshold_2, how_many_posterior_draws_2, truncation_compensation = 0.7352941, cores) %>% - # select(S, G, .lower_1 = .lower, .upper_1 = .upper) - # ), - - res_test, - formula, - !!.transcript, - !!.abundance, - !!.sample, + res_test %>% left_join(sample_exposure, by = quo_name(.sample)) , + formula,!!.transcript,!!.abundance,!!.sample, do_check_only_on_detrimental ) %>% @@ -409,7 +418,7 @@ plot_credible_intervals = function(.data){ mutate(plot = pmap( list( - `sample wise data`, + `sample_wise_data`, !!as.symbol(.transcript), # nested data for plot .abundance, diff --git a/R/tidybulk.R b/R/tidybulk.R index 5e3a1d7..ca36791 100644 --- a/R/tidybulk.R +++ b/R/tidybulk.R @@ -1,16 +1,15 @@ #' Drop lowly transcribed genes for TMM normalization #' #' @keywords internal +#' @noRd #' -#' @import dplyr -#' @importFrom tidyr spread +#' @importFrom tidyr spread +#' @importFrom tidyr drop_na #' @import tibble #' @importFrom rlang := #' @importFrom stats median -#' @importFrom edgeR filterByExpr -#' @importFrom rlang enquo -#' @importFrom rlang quo_is_symbol #' @importFrom purrr when +#' @importFrom rlang quo_is_symbol #' #' @param .data A tibble #' @param .sample The name of the sample column @@ -33,6 +32,7 @@ add_scaled_counts_bulk.get_low_expressed <- function(.data, .transcript = enquo(.transcript) .abundance = enquo(.abundance) + factor_of_interest = enquo(factor_of_interest) # Check if factor_of_interest is continuous and exists @@ -40,20 +40,15 @@ add_scaled_counts_bulk.get_low_expressed <- function(.data, factor_of_interest %>% when( - quo_is_symbol(.) && - select(.data, !!(.)) %>% - lapply(class) %>% - as.character() %in% c("numeric", "integer", "double") ~ { + select(.data, !!(.)) %>% lapply(class) %>% as.character() %in% c("numeric", "integer", "double") ~ { message("ppcseq says: The factor of interest is continuous (e.g., integer,numeric, double). The data will be filtered without grouping.") NULL }, - quo_is_symbol(.) ~ .data %>% distinct(!!.sample, !!factor_of_interest) %>% arrange(!!.sample) %>% pull(!!factor_of_interest), - ~ NULL ) @@ -77,7 +72,7 @@ add_scaled_counts_bulk.get_low_expressed <- function(.data, # Call edgeR as_matrix(rownames = !!.transcript) %>% - filterByExpr( + edgeR::filterByExpr( min.count = minimum_counts, group = string_factor_of_interest, min.prop = minimum_proportion @@ -88,7 +83,6 @@ add_scaled_counts_bulk.get_low_expressed <- function(.data, } # Set internal -#' @importFrom purrr when .identify_abundant = function(.data, .sample = NULL, .transcript = NULL, @@ -102,38 +96,45 @@ add_scaled_counts_bulk.get_low_expressed <- function(.data, .transcript = enquo(.transcript) .abundance = enquo(.abundance) + factor_of_interest = enquo(factor_of_interest) .data %>% - { - gene_to_exclude = - add_scaled_counts_bulk.get_low_expressed( - ., - .sample = !!.sample, - .transcript = !!.transcript, - .abundance = !!.abundance, - factor_of_interest = !!factor_of_interest, - minimum_counts = minimum_counts, - minimum_proportion = minimum_proportion - ) - - dplyr::mutate(., .abundant := !!.transcript %in% gene_to_exclude %>% not()) - } + # Filter + when( + + # If column is present use this instead of doing more work + ".abundant" %in% colnames(.) %>% not ~ { + gene_to_exclude = + add_scaled_counts_bulk.get_low_expressed( + .data, + .sample = !!.sample, + .transcript = !!.transcript, + .abundance = !!.abundance, + factor_of_interest = !!factor_of_interest, + minimum_counts = minimum_counts, + minimum_proportion = minimum_proportion + ) + + dplyr::mutate(., .abundant := !!.transcript %in% gene_to_exclude %>% not()) + }, + ~ (.) + ) } #' Get a tibble with scaled counts using TMM #' #' @keywords internal +#' @noRd #' #' @import dplyr #' @import tibble #' @importFrom magrittr equals #' @importFrom rlang := #' @importFrom stats median -#' @importFrom purrr when -#' @importFrom utils head +#' @importFrom utils install.packages #' #' @param .data A tibble #' @param .sample The name of the sample column @@ -142,6 +143,7 @@ add_scaled_counts_bulk.get_low_expressed <- function(.data, #' @param method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") #' @param reference_sample A character string. The name of the reference sample. If NULL the sample with highest total read count will be selected as reference. #' +#' #' @return A tibble including additional columns #' #' @@ -150,21 +152,29 @@ get_scaled_counts_bulk <- function(.data, .transcript = NULL, .abundance = NULL, method = "TMM", - reference_sample = NULL) { + reference_sample = NULL, + .library_size = NULL) { # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) .abundance = enquo(.abundance) - # Reformat input data set - df <- - .data %>% + .library_size = enquo(.library_size) - # Rename - dplyr::select(!!.sample,!!.transcript,!!.abundance) %>% + # # Check if package is installed, otherwise install + # if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) { + # message("ppcseq says: Installing edgeR needed for analyses") + # if (!requireNamespace("BiocManager", quietly = TRUE)) + # install.packages("BiocManager", repos = "https://cloud.r-project.org") + # BiocManager::install("edgeR", ask = FALSE) + # } + + # Set factors + .data = + .data %>% + dplyr::mutate(!!.sample := factor(!!.sample),!!.transcript := factor(!!.transcript)) %>% + droplevels() - # Set samples and genes as factors - dplyr::mutate(!!.sample := factor(!!.sample),!!.transcript := factor(!!.transcript)) # Get reference @@ -174,7 +184,7 @@ get_scaled_counts_bulk <- function(.data, !is.null(.) ~ (.), # If not specified take most abundance sample - df %>% + .data %>% group_by(!!.sample) %>% summarise(sum = median(!!.abundance)) %>% mutate(med = max(sum)) %>% @@ -188,18 +198,19 @@ get_scaled_counts_bulk <- function(.data, nf_obj <- add_scaled_counts_bulk.calcNormFactor( - df, + .data, reference, .sample = !!.sample, .transcript = !!.transcript, .abundance = !!.abundance, - method + method, + .library_size = !!.library_size ) # Calculate normalization factors nf_obj$nf %>% dplyr::left_join( - df %>% + .data %>% group_by(!!.sample) %>% summarise(tot = sum(!!.abundance, na.rm = TRUE)) %>% ungroup() %>% @@ -232,14 +243,12 @@ get_scaled_counts_bulk <- function(.data, #' Calculate the norm factor with calcNormFactor from limma #' #' @keywords internal +#' @noRd #' #' @import dplyr -#' @importFrom tidyr spread -#' @importFrom tidyr drop_na #' @import tibble #' @importFrom rlang := #' @importFrom stats setNames -#' @importFrom edgeR calcNormFactors #' #' @param .data A tibble #' @param reference A reference matrix, not sure if used anymore @@ -255,26 +264,34 @@ add_scaled_counts_bulk.calcNormFactor <- function(.data, .sample = `sample`, .transcript = `transcript`, .abundance = `count`, - method) { + method, + .library_size = NULL) { .sample = enquo(.sample) .transcript = enquo(.transcript) .abundance = enquo(.abundance) - # Get data frame for the highly transcribed transcripts - df.filt <- + .library_size = enquo(.library_size) + + # Force or not library size + lib.size = .data %>% - # dplyr::filter(!(!!.transcript %in% gene_to_exclude)) %>% - droplevels() %>% - select(!!.sample, !!.transcript, !!.abundance) + when( + !quo_is_null(.library_size) ~ distinct(., !!.sample, !!.library_size) %>% arrange(!!.sample) %>% pull(!!.library_size), + ~ NULL + ) + + # Get data frame for the highly transcribed transcripts + df.filt <- .data %>% select(!!.sample, !!.transcript, !!.abundance) # scaled data set nf = tibble::tibble( + # Sample factor sample = factor(levels(df.filt %>% pull(!!.sample))), # scaled data frame - nf = calcNormFactors( + nf = edgeR::calcNormFactors( df.filt %>% tidyr::spread(!!.sample,!!.abundance) %>% tidyr::drop_na() %>% @@ -282,7 +299,8 @@ add_scaled_counts_bulk.calcNormFactor <- function(.data, refColumn = which(reference == factor(levels( df.filt %>% pull(!!.sample) ))), - method = method + method = method, + lib.size = lib.size ) ) %>% diff --git a/R/utilities.R b/R/utilities.R index 57d66a9..2d184d2 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -168,7 +168,7 @@ format_for_MPI = function(df, shards, .sample) { # Add counts MPI rows indexes group_by(idx_MPI) %>% arrange(G) %>% - mutate(`read count MPI row` = seq_len(length.out=n())) %>% + mutate(`read_count_MPI_row` = seq_len(length.out=n())) %>% ungroup } @@ -259,7 +259,7 @@ vb_iterative = function(model, iter = iter, tol_rel_obj = tol_rel_obj, #seed = 654321, - pars=c("counts_rng", "exposure_rate", "alpha_sub_1", additional_parameters_to_save), + pars=c("counts_rng", "alpha_sub_1", additional_parameters_to_save), ... ) boolFalse <- TRUE @@ -290,10 +290,13 @@ vb_iterative = function(model, #' @noRd find_optimal_number_of_chains = function(how_many_posterior_draws, max_number_to_check = 100) { - foreach(cc = 2:max_number_to_check, .combine = bind_rows) %do% - { - tibble(chains = cc, tot = how_many_posterior_draws / cc + 150 * cc) - } %>% + + + 2:max_number_to_check %>% + map( + ~ tibble(chains = .x, tot = how_many_posterior_draws / .x + 150 * .x) + ) %>% + reduce(bind_rows) %>% filter(tot == tot %>% min) %>% pull(chains) @@ -307,6 +310,7 @@ find_optimal_number_of_chains = function(how_many_posterior_draws, #' #' @importFrom tibble rowid_to_column #' @importFrom purrr map +#' @importFrom purrr reduce #' #' @param counts_MPI A matrix of read count information #' @param to_exclude A vector of oulier data points to exclude @@ -315,38 +319,43 @@ find_optimal_number_of_chains = function(how_many_posterior_draws, #' @return A matrix #' @noRd get_outlier_data_to_exlude = function(counts_MPI, to_exclude, shards) { - # If there are genes to exclude - switch( - to_exclude %>% nrow %>% gt(0) %>% `!` %>% sum(1), - foreach(s = seq_len(length.out=shards), .combine = full_join) %do% { - counts_MPI %>% - inner_join(to_exclude, by = c("S", "G")) %>% - filter(idx_MPI == s) %>% - distinct(idx_MPI, `read count MPI row`) %>% - rowid_to_column %>% - spread(idx_MPI, `read count MPI row`) %>% - - # If a shard is empty create a dummy data set to avoid error - ifelse_pipe((.) %>% nrow == 0, ~ tibble(rowid = 1,!!as.symbol(s) := NA)) - - } %>% - - # Anonymous function - Add length array to the first row for indexing in MPI - # Input: tibble - # Output: tibble - { - bind_rows((.) %>% map(function(x) - x %>% is.na %>% `!` %>% as.numeric %>% sum) %>% unlist, - (.)) - } %>% - - select(-rowid) %>% - replace(is.na(.), 0 %>% as.integer) %>% - as_matrix() %>% t, - - # Otherwise - matrix(rep(0, shards)) - ) + + + to_exclude %>% + when( + # If there are genes to exclude + nrow(.) %>% gt(0) ~ seq_len(length.out = shards) %>% + map(~ { + s = .x + counts_MPI %>% + inner_join(to_exclude, by = c("S", "G")) %>% + filter(idx_MPI == s) %>% + distinct(idx_MPI, `read_count_MPI_row`) %>% + rowid_to_column %>% + spread(idx_MPI, `read_count_MPI_row`) %>% + + # If a shard is empty create a dummy data set to avoid error + ifelse_pipe((.) %>% nrow == 0, ~ tibble(rowid = 1, !!as.symbol(s) := NA)) + }) %>% + reduce(full_join, by = "rowid") %>% + + # Anonymous function - Add length array to the first row for indexing in MPI + # Input: tibble + # Output: tibble + { + bind_rows((.) %>% map(function(x) + x %>% is.na %>% `!` %>% as.numeric %>% sum) %>% unlist, + (.)) + } %>% + + select(-rowid) %>% + replace(is.na(.), 0 %>% as.integer) %>% + as_matrix() %>% t, + + # Otherwise + ~ matrix(rep(0, shards)) + ) + } #' function to pass initialisation values @@ -431,7 +440,12 @@ produce_plots = function(.x, )) ) - max_y = .x %>% summarise(a = max(!!as.symbol(.abundance)), b = max(.upper_2)) %>% as.numeric %>% max + max_y = .x %>% summarise(a = max(!!as.symbol(.abundance)), b = max(.upper)) %>% as.numeric %>% max + + # Scale + # .x = + # .x %>% + # mutate(across(c(.abundance, .lower_2, .upper_2), ~ .x * multiplier) ) { ggplot(data = .x, aes( @@ -448,11 +462,11 @@ produce_plots = function(.x, # ) } %>% when( - ".lower_2" %in% colnames(.x) ~ (.) + + ".lower" %in% colnames(.x) ~ (.) + geom_errorbar(aes( - ymin = `.lower_2`, - ymax = `.upper_2`, - color = `deleterious outliers` + ymin = `.lower`, + ymax = `.upper`, + color = `deleterious_outliers` ), width = 0), ~ (.) @@ -461,10 +475,10 @@ produce_plots = function(.x, ifelse_pipe( covariate %>% is.null %>% `!`, ~ .x + geom_point(aes( - size = `exposure rate`, fill = !!as.symbol(covariate) + size = exposure_rate, fill = !!as.symbol(covariate) ), shape = 21), ~ .x + geom_point( - aes(size = `exposure rate`), + aes(size = exposure_rate), shape = 21, fill = "black" ) @@ -485,14 +499,14 @@ add_deleterious_if_covariate_exists = function(.data, X){ select(2) %>% setNames("factor or interest") %>% mutate(S = seq_len(length.out=n())) %>% - mutate(`is group right` = `factor or interest` > mean(`factor or interest`)) , + mutate(`is_group_right` = `factor or interest` > mean(`factor or interest`)) , by = "S" ) %>% - mutate(`is group high` = (slope > 0 & `is group right`) | (slope < 0 & !`is group right`)) %>% + mutate(`is group high` = (slope > 0 & `is_group_right`) | (slope < 0 & !`is_group_right`)) %>% # Check if outlier might be deleterious for the statistics - mutate(`deleterious outliers` = (!ppc) & + mutate(`deleterious_outliers` = (!ppc) & (`is higher than mean` == `is group high`)), ~ (.) ) @@ -537,14 +551,11 @@ merge_results = function(res_discovery, res_test, formula, .transcript, .abundan !!.transcript, !!.abundance, !!.sample, - mean, - # `.lower_1`, - # `.upper_1`, - `exposure rate`, - slope_1 = slope, + slope_before_outlier_filtering = slope, one_of(parse_formula(formula)) ) %>% + # Attach results of tests left_join( res_test %>% @@ -552,15 +563,16 @@ merge_results = function(res_discovery, res_test, formula, .transcript, .abundan select( S, G, - mean_2 = mean, - .lower_2 = `.lower`, - .upper_2 = `.upper`, - slope_2 = slope, - ppc, - one_of(c("generated quantities", "deleterious outliers")) + exposure_rate, multiplier, + .lower = `.lower`, + .upper = `.upper`, + slope_after_outlier_filtering = slope, + posterior_predictive_check_succeded = ppc, + one_of(c("generated quantities", "deleterious_outliers")) ), by = c("S", "G") ) %>% + suppressWarnings() %>% # format results format_results(formula, !!.transcript, !!.abundance, !!.sample, do_check_only_on_detrimental) @@ -579,24 +591,18 @@ format_results = function(.data, formula, .transcript, .abundance, .sample, do_c .data %>% # Check if new package is installed with different sintax - ifelse_pipe( - packageVersion("tidyr") >= "0.8.3.9000", - ~ .x %>% nest(`sample wise data` = c(-!!.transcript)), - ~ .x %>% - group_by(!!.transcript) %>% - nest(`sample wise data` = -!!.transcript) - ) %>% + nest(`sample_wise_data` = -!!.transcript) %>% # Add summary statistics - mutate(`ppc samples failed` = map_int(`sample wise data`, ~ .x %>% pull(ppc) %>% `!` %>% sum)) %>% + mutate(`ppc_samples_failed` = map_int(`sample_wise_data`, ~ .x %>% pull(posterior_predictive_check_succeded) %>% `!` %>% sum)) %>% # If deleterious detection add summary as well ifelse_pipe( do_check_only_on_detrimental, ~ .x %>% mutate( - `tot deleterious outliers` = - map_int(`sample wise data`, ~ .x %>% pull(`deleterious outliers`) %>% sum) + `tot_deleterious_outliers` = + map_int(`sample_wise_data`, ~ .x %>% pull(`deleterious_outliers`) %>% sum) ) ) } @@ -642,43 +648,6 @@ select_to_check_and_house_keeping = function(.data, .do_check, .significance, .t } } - -#' add_exposure_rate -#' -#' @keywords internal -#' -#' -#' @importFrom tidyr separate -#' -#' @param .data A data frame -#' @param fit A fit object -#' -#' @return A `tbl` -#' -#' @noRd -add_exposure_rate = function(.data, fit){ - - writeLines(sprintf("executing %s", "add_exposure_rate")) - - .data %>% - left_join( - fit %>% - summary("exposure_rate") %$% - summary %>% - as_tibble(rownames = ".variable") %>% - separate( - .variable, - c(".variable", "S"), - sep = "[\\[,\\]]", - extra = "drop" - ) %>% - mutate(S = S %>% as.integer) %>% - rename(`exposure rate` = mean) %>% - select(S, `exposure rate`), - by = "S" - ) -} - check_if_within_posterior = function(.data, my_df, .do_check, .abundance){ writeLines(sprintf("executing %s", "check_if_within_posterior")) @@ -728,6 +697,7 @@ fit_to_counts_rng = function(fit, adj_prob_theshold){ extra = "drop") %>% mutate(S = S %>% as.integer, G = G %>% as.integer) %>% select(-one_of(c("n_eff", "Rhat", "khat"))) %>% + suppressWarnings() %>% rename(`.lower` = (.) %>% ncol - 1, `.upper` = (.) %>% ncol) } @@ -760,7 +730,7 @@ fit_to_counts_rng = function(fit, adj_prob_theshold){ #' @return A `tbl` #' #' @noRd -fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_posterior_draws, truncation_compensation, cores, how_many_to_check){ +fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_posterior_draws, truncation_compensation, cores, how_many_to_check, exposure_rate){ writeLines(sprintf("executing %s", "fit_to_counts_rng_approximated")) @@ -775,7 +745,7 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste # %>% as.data.frame() %>% setNames(sprintf("sigma.%s", colnames(.) %>% gsub("V", "", .))) %>% # as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, sigma, -.draw) %>% separate(par, c("par", "G"), sep="\\.") %>% select(-par) - draws_exposure = fit %>% rstan::extract("exposure_rate") %>% .[[1]] + # draws_exposure = fit %>% rstan::extract("exposure_rate") %>% .[[1]] # %>% as.data.frame() %>% setNames(sprintf("exposure.%s", colnames(.) %>% gsub("V", "", .))) %>% # as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, exposure, -.draw) %>% separate(par, c("par", "S"), sep="\\.") %>% select(-par) @@ -787,10 +757,11 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste list( S,G, truncation_compensation), ~ { - i_supersampled = seq_len(length.out=sample(length(draws_mu[,..1, ..2]), how_many_posterior_draws, replace = TRUE )) + i_supersampled = sample(seq_len(length(draws_mu[,..1, ..2])), how_many_posterior_draws, replace = TRUE ) + draws = rnbinom( n = how_many_posterior_draws, - mu = exp(draws_mu[,..1, ..2][i_supersampled] + draws_exposure[,..1][i_supersampled]), + mu = exp(draws_mu[,..1, ..2][i_supersampled] + exposure_rate[..1]), size = 1/exp(draws_sigma[,..2][i_supersampled]) * ..3 ) draws %>% @@ -809,44 +780,6 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste mutate(.variable = "counts_rng") %>% unnest(CI) - # draws_mu = - # fit %>% extract("lambda_log_param") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("mu.%s", colnames(.))) %>% - # as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, mu, -.draw) %>% separate(par, c("par", "S", "G"), sep="\\.") %>% select(-par) - # draws_sigma = - # fit %>% extract("sigma_raw") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("sigma.%s", colnames(.) %>% gsub("V", "", .))) %>% - # as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, sigma, -.draw) %>% separate(par, c("par", "G"), sep="\\.") %>% select(-par) - # draws_exposure = - # fit %>% extract("exposure_rate") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("exposure.%s", colnames(.) %>% gsub("V", "", .))) %>% - # as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, exposure, -.draw) %>% separate(par, c("par", "S"), sep="\\.") %>% select(-par) - # - # draws_mu %>% - # left_join(draws_sigma, by = c(".draw", "G")) %>% - # left_join(draws_exposure, by = c(".draw", "S")) %>% - # nest(data = -c(S, G)) %>% - # mutate( - # CI = map( - # data, - # ~ { - # .x_supersampled = .x %>% sample_n(how_many_posterior_draws, replace = TRUE) - # draws = rnbinom(n =how_many_posterior_draws, mu = exp(.x_supersampled$mu + .x_supersampled$exposure), size = 1/exp(.x_supersampled$sigma) * truncation_compensation ) - # draws %>% - # # Process quantile - # quantile(c(adj_prob_theshold, 1 - adj_prob_theshold)) %>% - # tibble::as_tibble(rownames="prop") %>% - # tidyr::spread(prop, value) %>% - # setNames(c(".lower", ".upper")) %>% - # # Add mean and sd - # dplyr::mutate(mean = mean(draws), sd = sd(draws)) - # } - # ) - # ) %>% - # select(-data) %>% - # unnest(CI) %>% - # - # # Adapt to old dataset - # mutate(.variable = "counts_rng") %>% - # mutate(S = as.integer(S), G = as.integer(G)) - } @@ -868,16 +801,15 @@ save_generated_quantities_in_case = function(.data, fit, save_generated_quantiti ) } -check_columns_exist = function(.data, .sample, .transcript, .abundance, .significance, .do_check){ +check_columns_exist = function(.data, .sample, .transcript, .abundance, .significance){ # Prepare column same enquo .sample = enquo(.sample) .transcript = enquo(.transcript) .abundance = enquo(.abundance) .significance = enquo(.significance) - .do_check = enquo(.do_check) - columns = c(quo_name(.sample), quo_name(.transcript), quo_name(.abundance), quo_name(.significance), quo_name(.do_check)) + columns = c(quo_name(.sample), quo_name(.transcript), quo_name(.abundance), quo_name(.significance)) if((!columns %in% (.data %>% colnames)) %>% any) stop( sprintf( @@ -907,21 +839,14 @@ check_columns_exist = function(.data, .sample, .transcript, .abundance, .signifi #' @return A `tbl` #' #' @noRd -check_if_any_NA = function(.data, .sample, .transcript, .abundance, .significance, .do_check, formula_columns){ - - # Prepare column same enquo - .sample = enquo(.sample) - .transcript = enquo(.transcript) - .abundance = enquo(.abundance) - .significance = enquo(.significance) - .do_check = enquo(.do_check) +#' +check_if_any_NA = function(.data, columns){ - columns = c(quo_name(.sample), quo_name(.transcript), quo_name(.abundance), quo_name(.significance), quo_name(.do_check), formula_columns) if( .data %>% drop_na(columns) %>% - nrow %>% st( .data %>% nrow ) + nrow %>% st( .data %>% nrow ) ) stop(sprintf("There are NA values in you tibble for any of the column %s", paste(columns, collapse=", "))) } @@ -1051,7 +976,6 @@ run_model = function(model, approximate_posterior_inference, chains, how_many_po tol_rel_obj = tol_rel_obj, pars = c( "counts_rng", - "exposure_rate", additional_parameters_to_save ) #, @@ -1071,7 +995,6 @@ run_model = function(model, approximate_posterior_inference, chains, how_many_po init = inits_fx, pars = c( "counts_rng", - "exposure_rate", additional_parameters_to_save ) ) @@ -1162,10 +1085,10 @@ identify_outliers_1_step = function(.data, #factor_of_interest = ifelse(parse_formula(formula) %>% length %>% `>` (0), parse_formula(formula)[1], "") # Check if columns exist - check_columns_exist(.data, !!.sample, !!.transcript, !!.abundance, !!.significance, !!.do_check) + check_columns_exist(.data, !!.sample, !!.transcript, !!.abundance, !!.significance) # Check if any column is NA or null - check_if_any_NA(.data, !!.sample, !!.transcript, !!.abundance, !!.significance, !!.do_check, parse_formula(formula)) + check_if_any_NA(.data, c(quo_name(.sample), quo_name(.transcript), quo_name(.abundance), quo_name(.significance), quo_name(.do_check), parse_formula(formula))) # Check is testing environment is supported if (approximate_posterior_inference & save_generated_quantities) @@ -1314,8 +1237,8 @@ identify_outliers_1_step = function(.data, `.lower`, `.upper`, ppc, - one_of(c("generated quantities", "deleterious outliers")), - `exposure rate`, + one_of(c("generated quantities", "deleterious_outliers")), + exposure_rate, one_of(parse_formula(formula)) ) %>% @@ -1408,8 +1331,7 @@ do_inference = function(my_df, X, lambda_mu_mu, cores, - exposure_rate_multiplier, - intercept_shift_scale, + sample_scaling, additional_parameters_to_save, adj_prob_theshold, how_many_posterior_draws, @@ -1435,7 +1357,7 @@ do_inference = function(my_df, .do_check = enquo(.do_check) # Check that the dataset is squared - if(my_df %>% distinct(!!.sample, !!.transcript) %>% count(!!.transcript) %>% count(n) %>% nrow %>% gt(1)) + if(my_df %>% distinct(!!.sample, !!.transcript) %>% count(!!.transcript) %>% count(n, name = "nn") %>% nrow %>% gt(1)) stop("The input data frame does not represent a rectangular structure. Each transcript must be present in all samples.") # Get the number of transcripts to check @@ -1472,7 +1394,7 @@ do_inference = function(my_df, # Setup dimensions of variables for the model G = counts_MPI %>% distinct(G) %>% nrow() S = counts_MPI %>% distinct(!!.sample) %>% nrow() - N = counts_MPI %>% distinct(idx_MPI,!!.abundance, `read count MPI row`) %>% count(idx_MPI) %>% summarise(max(n)) %>% pull(1) + N = counts_MPI %>% distinct(idx_MPI,!!.abundance, `read_count_MPI_row`) %>% count(idx_MPI) %>% summarise(max(n)) %>% pull(1) M = counts_MPI %>% distinct(`start`, idx_MPI) %>% count(idx_MPI) %>% pull(n) %>% max G_per_shard = counts_MPI %>% distinct(!!.transcript, idx_MPI) %>% count(idx_MPI) %>% pull(n) %>% as.array n_shards = min(shards, counts_MPI %>% distinct(idx_MPI) %>% nrow) @@ -1484,18 +1406,18 @@ do_inference = function(my_df, # Read count object counts = counts_MPI %>% - distinct(idx_MPI,!!.abundance, `read count MPI row`) %>% + distinct(idx_MPI,!!.abundance, `read_count_MPI_row`) %>% spread(idx_MPI,!!.abundance) %>% - select(-`read count MPI row`) %>% + select(-`read_count_MPI_row`) %>% replace(is.na(.), 0 %>% as.integer) %>% as_matrix() %>% t # Indexes of the samples sample_idx = counts_MPI %>% - distinct(idx_MPI, S, `read count MPI row`) %>% + distinct(idx_MPI, S, `read_count_MPI_row`) %>% spread(idx_MPI, S) %>% - select(-`read count MPI row`) %>% + select(-`read_count_MPI_row`) %>% replace(is.na(.), 0 %>% as.integer) %>% as_matrix() %>% t @@ -1541,6 +1463,15 @@ do_inference = function(my_df, # Dimension od the data package to pass to Stan CP = ncol(counts_package) + exposure_rate = + sample_scaling %>% + left_join( + counts_MPI %>% distinct(!!.sample, S), by=quo_name(.sample) + ) %>% + distinct(S, exposure_rate ) %>% + arrange(S) %>% + pull(exposure_rate) + # Run model #writeLines(sprintf("- Roughly the memory allocation for the fit object is %s Gb", object.size(1:(S * how_many_to_check * how_many_posterior_draws))/1e9)) @@ -1575,7 +1506,6 @@ do_inference = function(my_df, init = inits_fx, pars = c( "counts_rng", - "exposure_rate", "alpha_sub_1", additional_parameters_to_save ) @@ -1587,16 +1517,13 @@ do_inference = function(my_df, ifelse_pipe( approximate_posterior_analysis, - ~ .x %>% fit_to_counts_rng_approximated(adj_prob_theshold, how_many_posterior_draws, truncation_compensation, cores, how_many_to_check), + ~ .x %>% fit_to_counts_rng_approximated(adj_prob_theshold, how_many_posterior_draws, truncation_compensation, cores, how_many_to_check, exposure_rate), ~ .x %>% fit_to_counts_rng(adj_prob_theshold) ) %>% # If generated quantities are saved save_generated_quantities_in_case(fit, save_generated_quantities) %>% - # Add exposure rate - #add_exposure_rate(fit) %>% - # Check if data is within posterior check_if_within_posterior(my_df, .do_check, .abundance) %>% @@ -1607,12 +1534,9 @@ do_inference = function(my_df, add_deleterious_if_covariate_exists(X) %>% # Add position in MPI package for next inference - left_join(counts_MPI %>% distinct(S, G, idx_MPI, `read count MPI row`), + left_join(counts_MPI %>% distinct(S, G, idx_MPI, `read_count_MPI_row`), by = c("S", "G")) %>% - # Add exposure rate - add_exposure_rate(fit) %>% - # needed for the figure article ifelse_pipe(pass_fit, ~ .x %>% add_attr(fit, "fit") ) %>% diff --git a/README.Rmd b/README.Rmd index 210bc15..860dbdb 100644 --- a/README.Rmd +++ b/README.Rmd @@ -10,7 +10,8 @@ knitr::opts_chunk$set( fig.path = "man/figures/") The input data set is a tidy representation of a differential gene transcript abundance analysis ```{r echo=FALSE, include=FALSE} -library(tidyverse) +library(dplyr) +library(ggplot2) library(ppcseq) library(dplyr) library(magrittr) @@ -47,8 +48,8 @@ You can identify anrtefactual calls from your differential transcribt anundance counts.ppc = - counts %>% - mutate(is_significant = FDR < 0.01) %>% + counts |> + mutate(is_significant = FDR < 0.01) |> identify_outliers( formula = ~ Label, .sample = sample, @@ -72,12 +73,12 @@ We can visualise the top five differentially transcribed genes ```{r } counts.ppc_plots = - counts.ppc %>% + counts.ppc |> plot_credible_intervals() ``` ```{r} -counts.ppc_plots %>% - pull(plot) %>% - .[seq_len(length.out=2)] +counts.ppc_plots |> + pull(plot) |> + head(2) ``` diff --git a/README.md b/README.md index 50227fb..6817a90 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ writeLines(c( "CXX14FLAGS += -O3","CXX14FLAGS += -DSTAN_THREADS", "CXX14FLAGS += close(fileConn) ``` -Then, install with +Then, install with ``` r devtools::install_github("stemangiola/ppcseq") @@ -25,7 +25,8 @@ devtools::install_github("stemangiola/ppcseq") You can get the test dataset with ``` r -ppcseq::counts +data("counts") +counts ``` ## # A tibble: 394,821 x 9 @@ -51,8 +52,8 @@ anundance analysis, due to outliers. counts.ppc = - ppcseq::counts %>% - mutate(is_significant = FDR < 0.01) %>% + counts |> + mutate(is_significant = FDR < 0.01) |> identify_outliers( formula = ~ Label, .sample = sample, @@ -72,23 +73,23 @@ counts.ppc ``` ## # A tibble: 15 x 4 - ## symbol `sample wise data` `ppc samples failed` `tot deleterious outliers` - ## - ## 1 SLC16A12 0 0 - ## 2 CYP1A1 1 1 - ## 3 ART3 0 0 - ## 4 DIO2 0 0 - ## 5 OR51E2 0 0 - ## 6 MUC16 0 0 - ## 7 CCNA1 0 0 - ## 8 LYZ 1 1 - ## 9 PPM1H 0 0 - ## 10 SUSD5 0 0 - ## 11 TPRG1 0 0 - ## 12 EPB42 0 0 - ## 13 LRRC38 0 0 - ## 14 SUSD4 0 0 - ## 15 MMP8 0 0 + ## symbol sample_wise_data ppc_samples_failed tot_deleterious_outliers + ## + ## 1 SLC16A12 0 0 + ## 2 CYP1A1 1 1 + ## 3 ART3 0 0 + ## 4 DIO2 0 0 + ## 5 OR51E2 0 0 + ## 6 MUC16 0 0 + ## 7 CCNA1 0 0 + ## 8 LYZ 1 1 + ## 9 PPM1H 0 0 + ## 10 SUSD5 0 0 + ## 11 TPRG1 0 0 + ## 12 EPB42 0 0 + ## 13 LRRC38 0 0 + ## 14 SUSD4 0 0 + ## 15 MMP8 0 0 The new data frame contains plots for each gene @@ -96,14 +97,14 @@ We can visualise the top five differentially transcribed genes ``` r counts.ppc_plots = - counts.ppc %>% + counts.ppc |> plot_credible_intervals() ``` ``` r -counts.ppc_plots %>% - pull(plot) %>% - .[1:2] +counts.ppc_plots |> + pull(plot) |> + head(2) ``` ## [[1]] diff --git a/inst/stan/negBinomial_MPI.stan b/inst/stan/negBinomial_MPI.stan index e412161..f54e293 100644 --- a/inst/stan/negBinomial_MPI.stan +++ b/inst/stan/negBinomial_MPI.stan @@ -165,8 +165,7 @@ data { int how_many_to_check; // For better adaptation - real exposure_rate_multiplier; - real intercept_shift_scale[2]; + vector[S] exposure_rate; // Truncation real truncation_compensation; @@ -185,8 +184,6 @@ parameters { real lambda_sigma; real lambda_skew; - vector[S] exposure_rate; - // Gene-wise properties of the data row_vector[G] intercept; row_vector[how_many_to_check] alpha_sub_1; @@ -225,10 +222,6 @@ model { sigma_raw ~ normal(sigma_slope * intercept + sigma_intercept,sigma_sigma); - // Exposure prior - exposure_rate ~ normal(0,1); - sum(exposure_rate) ~ normal(0, 0.001 * S); - //Gene-wise properties of the data target += sum(map_rect( lp_reduce , diff --git a/man/add_scaled_counts_bulk.calcNormFactor.Rd b/man/add_scaled_counts_bulk.calcNormFactor.Rd deleted file mode 100644 index e3f6318..0000000 --- a/man/add_scaled_counts_bulk.calcNormFactor.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tidybulk.R -\name{add_scaled_counts_bulk.calcNormFactor} -\alias{add_scaled_counts_bulk.calcNormFactor} -\title{Calculate the norm factor with calcNormFactor from limma} -\usage{ -add_scaled_counts_bulk.calcNormFactor( - .data, - reference = NULL, - .sample = sample, - .transcript = transcript, - .abundance = count, - method -) -} -\arguments{ -\item{.data}{A tibble} - -\item{reference}{A reference matrix, not sure if used anymore} - -\item{.sample}{The name of the sample column} - -\item{.transcript}{The name of the transcript/gene column} - -\item{.abundance}{The name of the transcript/gene abundance column} - -\item{method}{A string character. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile")} -} -\value{ -A list including the filtered data frame and the normalization factors -} -\description{ -Calculate the norm factor with calcNormFactor from limma -} -\keyword{internal} diff --git a/man/add_scaled_counts_bulk.get_low_expressed.Rd b/man/add_scaled_counts_bulk.get_low_expressed.Rd deleted file mode 100644 index 41e3012..0000000 --- a/man/add_scaled_counts_bulk.get_low_expressed.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tidybulk.R -\name{add_scaled_counts_bulk.get_low_expressed} -\alias{add_scaled_counts_bulk.get_low_expressed} -\title{Drop lowly transcribed genes for TMM normalization} -\usage{ -add_scaled_counts_bulk.get_low_expressed( - .data, - .sample = sample, - .transcript = transcript, - .abundance = count, - factor_of_interest = NULL, - minimum_counts = 10, - minimum_proportion = 0.7 -) -} -\arguments{ -\item{.data}{A tibble} - -\item{.sample}{The name of the sample column} - -\item{.transcript}{The name of the transcript/gene column} - -\item{.abundance}{The name of the transcript/gene abundance column} - -\item{factor_of_interest}{The name of the column of the factor of interest} - -\item{minimum_counts}{A positive integer. Minimum counts required for at least some samples.} - -\item{minimum_proportion}{A real positive number between 0 and 1. It is the threshold of proportion of samples for each transcripts/genes that have to be characterised by a cmp bigger than the threshold to be included for scaling procedure.} -} -\value{ -A tibble filtered -} -\description{ -Drop lowly transcribed genes for TMM normalization -} -\keyword{internal} diff --git a/man/figures/unnamed-chunk-9-1.png b/man/figures/unnamed-chunk-9-1.png index 7360caf..71e5d5e 100644 Binary files a/man/figures/unnamed-chunk-9-1.png and b/man/figures/unnamed-chunk-9-1.png differ diff --git a/man/figures/unnamed-chunk-9-2.png b/man/figures/unnamed-chunk-9-2.png index 72f4c3e..5169cae 100644 Binary files a/man/figures/unnamed-chunk-9-2.png and b/man/figures/unnamed-chunk-9-2.png differ diff --git a/man/get_scaled_counts_bulk.Rd b/man/get_scaled_counts_bulk.Rd deleted file mode 100644 index 1ca6abb..0000000 --- a/man/get_scaled_counts_bulk.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tidybulk.R -\name{get_scaled_counts_bulk} -\alias{get_scaled_counts_bulk} -\title{Get a tibble with scaled counts using TMM} -\usage{ -get_scaled_counts_bulk( - .data, - .sample = NULL, - .transcript = NULL, - .abundance = NULL, - method = "TMM", - reference_sample = NULL -) -} -\arguments{ -\item{.data}{A tibble} - -\item{.sample}{The name of the sample column} - -\item{.transcript}{The name of the transcript/gene column} - -\item{.abundance}{The name of the transcript/gene abundance column} - -\item{method}{A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile")} - -\item{reference_sample}{A character string. The name of the reference sample. If NULL the sample with highest total read count will be selected as reference.} -} -\value{ -A tibble including additional columns -} -\description{ -Get a tibble with scaled counts using TMM -} -\keyword{internal} diff --git a/man/identify_outliers.Rd b/man/identify_outliers.Rd index 2b9a860..bc136b8 100644 --- a/man/identify_outliers.Rd +++ b/man/identify_outliers.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ppcseq.R +% Please edit documentation in R/methods.R \name{identify_outliers} \alias{identify_outliers} \title{identify_outliers main} @@ -12,6 +12,7 @@ identify_outliers( .abundance, .significance, .do_check, + .scaling_factor = NULL, percent_false_positive_genes = 1, how_many_negative_controls = 500, approximate_posterior_inference = TRUE, @@ -43,6 +44,8 @@ identify_outliers( \item{.do_check}{A column name as symbol. A column with a boolean indicating whether a transcript was identified as differentially abundant} +\item{.scaling_factor}{In case the scaling factor must not be calculated (TMM method) using the input data but provided. It is useful, for example, for pseudobulk single-cell where the scaling might depend on sample sequencing depth for all cells rather than a particular cell type.} + \item{percent_false_positive_genes}{A real between 0 and 100. It is the aimed percent of transcript being a false positive. For example, percent_false_positive_genes = 1 provide 1 percent of the calls for outlier containing transcripts that has actually not outliers.} \item{how_many_negative_controls}{An integer. How many transcript from the bottom non-significant should be taken for inferring the mean-overdispersion trend.} @@ -72,7 +75,7 @@ identify_outliers( \item{adj_prob_theshold_2}{A boolean. Used for development and testing purposes} } \value{ -A nested tibble \code{tbl} with transcript-wise information: \verb{sample wise data} | plot | \verb{ppc samples failed} | \verb{tot deleterious outliers} +A nested tibble \code{tbl} with transcript-wise information: \code{sample_wise_data} | plot | \verb{ppc samples failed} | \verb{tot deleterious_outliers} } \description{ This function runs the data modeling and statistical test for the hypothesis that a transcript includes outlier biological replicate. diff --git a/man/plot_credible_intervals.Rd b/man/plot_credible_intervals.Rd index 2dcf1eb..7f0d5a3 100644 --- a/man/plot_credible_intervals.Rd +++ b/man/plot_credible_intervals.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ppcseq.R +% Please edit documentation in R/methods.R \name{plot_credible_intervals} \alias{plot_credible_intervals} \title{plot_credible interval for theoretical data distributions} diff --git a/tests/testthat/test-ppcSeq.R b/tests/testthat/test-ppcSeq.R index 8721f95..4cf0a5f 100644 --- a/tests/testthat/test-ppcSeq.R +++ b/tests/testthat/test-ppcSeq.R @@ -7,8 +7,8 @@ if(Sys.info()[['sysname']] == "Linux"){ test_that("VB post approx no correction",{ res = - counts %>% - dplyr::mutate( is_significant = ifelse(symbol %in% c("SLC16A12", "CYP1A1", "ART3"), TRUE, FALSE) ) %>% + counts |> + dplyr::mutate( is_significant = ifelse(symbol %in% c("SLC16A12", "CYP1A1", "ART3"), TRUE, FALSE) ) |> ppcseq::identify_outliers( formula = ~ Label, sample, symbol, value, @@ -19,7 +19,8 @@ test_that("VB post approx no correction",{ approximate_posterior_inference =TRUE, approximate_posterior_analysis =TRUE, how_many_negative_controls = 50, - cores=1,pass_fit = TRUE + cores=1, + pass_fit = TRUE ) expect_equal(