man/run_sl.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DA-sl.R
\name{run_sl}
\alias{run_sl}
\title{Identify biomarkers using supervised learning (SL) methods}
\usage{
run_sl(
  ps,
  group,
  taxa_rank = "all",
  transform = c("identity", "log10", "log10p"),
  norm = "none",
  norm_para = list(),
  nfolds = 3,
  nrepeats = 3,
  sampling = NULL,
  tune_length = 5,
  top_n = 10,
  method = c("LR", "RF", "SVM"),
  ...
)
}
\arguments{
\item{ps}{a \code{\link[phyloseq]{phyloseq-class}} object.}

\item{group}{character, the variable to set the group.}

\item{taxa_rank}{character to specify taxonomic rank to perform
differential analysis on. Should be one of
\code{phyloseq::rank_names(phyloseq)}, or "all" means to summarize the taxa by
the top taxa ranks (\code{summarize_taxa(ps, level = rank_names(ps)[1])}), or
"none" means perform differential analysis on the original taxa
(\code{taxa_names(phyloseq)}, e.g., OTU or ASV).}

\item{transform}{character, the methods used to transform the microbial
abundance. See \code{\link[=transform_abundances]{transform_abundances()}} for more details. The
options include:
\itemize{
\item "identity", return the original data without any transformation
(default).
\item "log10", the transformation is \code{log10(object)}, and if the data contains
zeros the transformation is \code{log10(1 + object)}.
\item "log10p", the transformation is \code{log10(1 + object)}.
}}

\item{norm}{the methods used to normalize the microbial abundance data. See
\code{\link[=normalize]{normalize()}} for more details.
Options include:
\itemize{
\item "none": do not normalize.
\item "rarefy": random subsampling counts to the smallest library size in the
data set.
\item "TSS": total sum scaling, also referred to as "relative abundance", the
abundances are normalized by dividing by the corresponding sample library
size.
\item "TMM": trimmed mean of m-values. First, a sample is chosen as reference.
The scaling factor is then derived using a weighted trimmed mean over the
differences of the log-transformed gene-count fold-change between the
sample and the reference.
\item "RLE": relative log expression. RLE uses a pseudo-reference calculated
using the geometric mean of the gene-specific abundances over all
samples. The scaling factors are then calculated as the median of the
gene counts ratios between the samples and the reference.
\item "CSS": cumulative sum scaling, calculates scaling factors as the
cumulative sum of gene abundances up to a data-derived threshold.
\item "CLR": centered log-ratio normalization.
\item "CPM": per-sample normalization of the sum of the values to 1e+06.
}}

\item{norm_para}{named \code{list}. Other arguments passed to specific
normalization methods.  Most users will not need to pass any additional
arguments here.}

\item{nfolds}{the number of splits in CV.}

\item{nrepeats}{the number of complete sets of folds to compute.}

\item{sampling}{a single character value describing the type of additional
sampling that is conducted after resampling (usually to resolve class
imbalances). Values are "none", "down", "up", "smote", or "rose". For
more details see \code{\link[caret:trainControl]{caret::trainControl()}}.}

\item{tune_length}{an integer denoting the amount of granularity in the
tuning parameter grid. For more details see \code{\link[caret:train]{caret::train()}}.}

\item{top_n}{an integer denoting the number of top features to select as
biomarkers according to the importance score.}

\item{method}{supervised learning method, options are "LR" (logistic
regression), "RF" (random forest), or "SVM" (support vector machine).}

\item{...}{extra arguments passed to the classification method, e.g.,
\code{importance} for \code{randomForest::randomForest}.}
}
\value{
a \linkS4class{microbiomeMarker} object.
}
\description{
Identify biomarkers using logistic regression, random forest, or support
vector machine.
}
\details{
Only two-group comparisons are supported in the current version, and the
markers are selected based on their importance scores. Moreover, the
hyper-parameters are selected automatically by a grid-search based method
in the N-time K-fold cross-validation. Thus, the identified biomarkers
can be biased due to model overfitting for small datasets (e.g., with fewer
than 100 samples).

The argument \code{top_n} is used to denote the number of markers based on the
importance score. There is no rule or principle on how to select \code{top_n},
however, usually it is very useful to try a different \code{top_n} and compare
the performance of the marker predictions for the testing data.
}
\examples{
# Load the example data set
data(enterotypes_arumugam)
# small example phyloseq object for test: keep only the taxa whose Phylum is
# Firmicutes or Bacteroidetes
ps_small <- phyloseq::subset_taxa(
    enterotypes_arumugam,
    Phylum \%in\% c("Firmicutes", "Bacteroidetes")
)

# Fix the random seed so that the example output is reproducible
set.seed(2021)
# Logistic regression ("LR") on TSS-normalized, genus-level abundances with
# 2 folds and a single repeat, keeping the top 15 features as markers.
# Note: no trailing comma after the last argument, so that no empty argument
# is passed on through `...`.
mm <- run_sl(
    ps_small,
    group = "Gender",
    taxa_rank = "Genus",
    nfolds = 2,
    nrepeats = 1,
    top_n = 15,
    norm = "TSS",
    method = "LR"
)
mm
}
\seealso{
\code{\link[caret:train]{caret::train()}}, \code{\link[caret:trainControl]{caret::trainControl()}}
}
\author{
Yang Cao
}