-
Notifications
You must be signed in to change notification settings - Fork 27
/
diamond_rec.Rd
139 lines (107 loc) · 6.8 KB
/
diamond_rec.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/diamond_rec.R
\name{diamond_rec}
\alias{diamond_rec}
\title{Perform a DIAMOND2 reciprocal best hit (RBH) search}
\usage{
diamond_rec(
query_file,
subject_file,
seq_type = "cds",
format = "fasta",
diamond_algorithm = "blastp",
sensitivity_mode = "fast",
delete_corrupt_cds = TRUE,
eval = "1E-5",
max.target.seqs = 10000,
path = NULL,
comp_cores = 1,
diamond_params = NULL,
clean_folders = FALSE,
save.output = NULL
)
}
\arguments{
\item{query_file}{a character string specifying the path to the CDS file of interest (query organism).}
\item{subject_file}{a character string specifying the path to the CDS file of interest (subject organism).}
\item{seq_type}{a character string specifying the sequence type stored in the input file.
Options are are: "cds", "protein", or "dna". In case of "cds", sequence are translated to protein sequences,
in case of "dna", cds prediction is performed on the corresponding sequences which subsequently are
translated to protein sequences. Default is \code{seq_type} = "cds".}
\item{format}{a character string specifying the file format of the sequence file, e.g. \code{format} = \code{"fasta"}.
Default is \code{format} = \code{"fasta"}.}
\item{diamond_algorithm}{a character string specifying the DIAMOND2 algorithm that shall be used, option is currently limited to: \code{diamond_algorithm} = \code{"blastp"}}
\item{sensitivity_mode}{specify the level of alignment sensitivity. The higher the sensitivity level, the more deep homologs can be found, but at the cost of reduced computational speed.
- sensitivity_mode = "faster" : fastest alignment mode, but least sensitive (default). Designed for finding hits of >70
- sensitivity_mode = "default" : Default mode. Designed for finding hits of >70
- sensitivity_mode = "fast" : fast alignment mode, but least sensitive (default). Designed for finding hits of >70
- sensitivity_mode = "mid-sensitive" : fast alignments between the fast mode and the sensitive mode in sensitivity.
- sensitivity_mode = "sensitive" : fast alignments, but full sensitivity for hits >40
- sensitivity_mode = "more-sensitive" : more sensitive than the sensitive mode.
- sensitivity_mode = "very-sensitive" : sensitive alignment mode.
- sensitivity_mode = "ultra-sensitive" : most sensitive alignment mode (sensitivity as high as BLASTP).}
\item{delete_corrupt_cds}{a logical value indicating whether sequences with corrupt base triplets should be removed from the input \code{file}. This is the case when the length of coding sequences cannot be divided by 3 and thus the coding sequence contains at least one corrupt base triplet.}
\item{eval}{a numeric value specifying the E-Value cutoff for DIAMOND2 hit detection.}
\item{max.target.seqs}{a numeric value specifying the number of aligned sequences to keep.
Please be aware that \code{max.target.seqs} selects best hits based on the database entry and not by the best e-value. See details here: https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/bty833/5106166 .}
\item{path}{a character string specifying the path to the DIAMOND2 program (in case you don't use the default path).}
\item{comp_cores}{a numeric value specifying the number of cores to be used for multicore DIAMOND2 computations.}
\item{diamond_params}{a character string listing the input paramters that shall be passed to the executing DIAMOND2 program. Default is \code{NULL}, implicating
that a set of default parameters is used when running DIAMOND2.}
\item{clean_folders}{a boolean value spefiying whether all internall folders storing the output of used programs
shall be removed. Default is \code{clean_folders} = \code{FALSE}.}
\item{save.output}{a path to the location were the DIAMOND2 output shall be stored. E.g. \code{save.output} = \code{getwd()}
to store it in the current working directory, or \code{save.output} = \code{file.path(put,your,path,here)}.}
}
\value{
A data.table as returned by the \code{\link{diamond}} function, storing the geneids
of orthologous genes (reciprocal best hit) in the first column and the amino acid sequences in the second column.
}
\description{
This function performs a DIAMOND2 search (reciprocal best hit) of a given set of protein sequences against a second
set of protein sequences and vice versa.
}
\details{
Given a set of protein sequences A and a different set of protein sequences B,
first a best hit diamond search is being performed from A to B: diamond(A,B) and afterwards
a best hit diamond search is being performed from B to A: diamond(B,A). Only protein sequences
that were found to be best hits in both directions are retained and returned.
This function gives the same output as \code{\link{blast_rec}} while being much much faster.
This function can be used to perform orthology inference using DIAMOND2 best reciprocal hit methodology.
}
\examples{
\dontrun{
# performing gene orthology inference using the reciprocal best hit (RBH) method
diamond_rec(query_file = system.file('seqs/ortho_thal_cds.fasta', package = 'orthologr'),
subject_file = system.file('seqs/ortho_lyra_cds.fasta', package = 'orthologr'))
# performing gene orthology inference using the reciprocal best hit (RBH) method
# starting with protein sequences
diamond_rec(query_file = system.file('seqs/ortho_thal_aa.fasta', package = 'orthologr'),
subject_file = system.file('seqs/ortho_lyra_aa.fasta', package = 'orthologr'),
seq_type = "protein")
# save the DIAMOND2 output file to the current working directory
diamond_rec(query_file = system.file('seqs/ortho_thal_aa.fasta', package = 'orthologr'),
subject_file = system.file('seqs/ortho_lyra_aa.fasta', package = 'orthologr'),
seq_type = "protein",
save.output = getwd())
# use multicore processing
diamond_rec(query_file = system.file('seqs/ortho_thal_cds.fasta', package = 'orthologr'),
subject_file = system.file('seqs/ortho_lyra_cds.fasta', package = 'orthologr'),
comp_cores = 2)
# performing gene orthology inference using the reciprocal best hit (RBH) method
# and external path to diamond
diamond_rec(query_file = system.file('seqs/ortho_thal_cds.fasta', package = 'orthologr'),
subject_file = system.file('seqs/ortho_lyra_cds.fasta', package = 'orthologr'),
path = "path/to/diamond")
}
}
\references{
Buchfink, B., Reuter, K., & Drost, H. G. (2021) "Sensitive protein alignments at tree-of-life scale using DIAMOND." Nature methods, 18(4), 366-368.
https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options
}
\seealso{
\code{\link{diamond}}, \code{\link{diamond_best}}, \code{\link{set_diamond}}, \code{\link{blast_rec}}
}
\author{
Jaruwatana Sodai Lotharukpong
}