Update FreeBayes to version 1.3.1

ashvark · Aug 2, 2019 · 7c91103 · 7c91103
1 parent c640a89
commit 7c91103
Show file tree

Hide file tree

Showing 6 changed files with 216 additions and 8 deletions.
diff --git a/tools/freebayes/freebayes.xml b/tools/freebayes/freebayes.xml
@@ -83,6 +83,8 @@
         #elif str( $options_type.options_type_selector ) == "simple_w_filters":
             --standard-filters
             --min-coverage ${options_type.min_coverage}
+            --skip-coverage ${options_type.skip_coverage}
+            --limit-coverage ${options_type.limit_coverage}
         #elif str( $options_type.options_type_selector ) == "naive":
             --haplotype-length 0
             --min-alternate-count 1
@@ -97,6 +99,8 @@
             --report-monomorphic
             --standard-filters
             --min-coverage ${options_type.min_coverage}
+            --skip-coverage ${options_type.skip_coverage}
+            --limit-coverage ${options_type.limit_coverage}
         #elif str( $options_type.options_type_selector ) == "full":
             #if str( $options_type.optional_inputs.optional_inputs_selector ) == 'set':
                 ${options_type.optional_inputs.report_monomorphic}
@@ -645,7 +649,7 @@
             <output name="output_vcf" file="freebayes-phix174-test4.vcf" lines_diff="4" />
         </test>
     </tests>
-    <help>
+    <help><![CDATA[
 **What it does**
 
 FreeBayes is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs (single-nucleotide polymorphisms), indels (insertions and deletions), MNPs (multi-nucleotide polymorphisms), and complex events (composite insertion and substitution events) smaller than the length of a short-read sequencing alignment.
@@ -680,10 +684,212 @@ Galaxy allows five levels of control over FreeBayes options, provided by the **C
 
 ------
 
+**Command-line parameters**
+
+**Input**::
+
+    --bam FILE                          The file or set of BAM files to be analyzed.
+    --bam-list FILE                     A file containing a list of BAM files to be analyzed.
+
+    --stdin                             Read BAM input on stdin.
+    --fasta-reference FILE              Use FILE as the reference sequence for analysis.
+                                        An index file (FILE.fai) will be created if none exists.
+                                        If neither --targets nor --region are specified, FreeBayes
+                                        will analyze every position in this reference.
+    --targets FILE                      Limit analysis to targets listed in the BED-format FILE.
+    --region <chrom>:<start>-<end>      Limit analysis to the specified region, 0-base coordinates,
+                                        end_position not included (same as BED format).
+                                        Either '-' or '..' may be used as a separator.
+    --samples FILE                      Limit analysis to samples listed (one per line) in the FILE.
+                                        By default FreeBayes will analyze all samples in its input
+                                        BAM files.
+    --populations FILE                  Each line of FILE should list a sample and a population which
+                                        it is part of.  The population-based bayesian inference model
+                                        will then be partitioned on the basis of the populations.
+    --cnv-map FILE                      Read a copy number map from the BED file FILE, which has
+                                        either a sample-level ploidy:
+                                        sample_name copy_number
+                                        or a region-specific format:
+                                        seq_name start end sample_name copy_number
+                                        ... for each region in each sample which does not have the
+                                        default copy number as set by --ploidy. These fields can be delimited
+                                        by space or tab.
+
+**Output**::
+
+    --vcf FILE                          Output VCF-format results to FILE. (default: stdout)
+    --gvcf                              Write gVCF output, which indicates coverage in uncalled regions.
+    --gvcf-chunk NUM                    When writing gVCF output emit a record for every NUM bases.
+    --gvcf-dont-use-chunk               When writing the gVCF output emit a record for all bases if
+                                        set to "true", will also route an int to --gvcf-chunk
+                                        similar to --output-mode EMIT_ALL_SITES from GATK
+    --variant-input VCF                 Use variants reported in VCF file as input to the algorithm.
+                                        Variants in this file will be included in the output even if
+                                        there is not enough support in the data to pass input filters.
+    --only-use-input-alleles            Only provide variant calls and genotype likelihoods for sites
+                                        and alleles which are provided in the VCF input, and provide
+                                        output in the VCF for all input alleles, not just those which
+                                        have support in the data.
+    --haplotype-basis-alleles VCF       When specified, only variant alleles provided in this input
+                                        VCF will be used for the construction of complex or haplotype
+                                        alleles.
+    --report-all-haplotype-alleles      At sites where genotypes are made over haplotype alleles,
+                                        provide information about all alleles in output, not only
+                                        those which are called.
+    --report-monomorphic                Report even loci which appear to be monomorphic, and report all
+                                        considered alleles, even those which are not in called genotypes.
+                                        Loci which do not have any potential alternates have '.' for ALT.
+    --pvar N                            Report sites if the probability that there is a polymorphism
+                                        at the site is greater than N.  default: 0.0.  Note that post-
+                                        filtering is generally recommended over the use of this parameter.
+    --strict-vcf                        Generate strict VCF format (FORMAT/GQ will be an int)
+
+**Population model**::
+
+    --theta N                           The expected mutation rate or pairwise nucleotide diversity
+                                        among the population under analysis.  This serves as the
+                                        single parameter to the Ewens Sampling Formula prior model
+                                        default: 0.001
+    --ploidy N                          Sets the default ploidy for the analysis to N.  default: 2
+    --pooled-discrete                   Assume that samples result from pooled sequencing.
+                                        Model pooled samples using discrete genotypes across pools.
+                                        When using this flag, set --ploidy to the number of
+                                        alleles in each sample or use the --cnv-map to define
+                                        per-sample ploidy.
+    --pooled-continuous                 Output all alleles which pass input filters, regardless of
+                                        genotyping outcome or model.
+
+**Reference allele**::
+
+    --use-reference-allele              This flag includes the reference allele in the analysis as
+                                        if it is another sample from the same population.
+    --reference-quality MQ,BQ           Assign mapping quality of MQ to the reference allele at each
+                                        site and base quality of BQ.  default: 100,60
+
+**Allele scope**::
+
+    --use-best-n-alleles N              Evaluate only the best N SNP alleles, ranked by sum of
+                                        supporting quality scores.  (Set to 0 to use all; default: all)
+    --max-complex-gap N
+    --haplotype-length N                Allow haplotype calls with contiguous embedded matches of up
+                                        to this length. Set N=-1 to disable clumping. (default: 3)
+    --min-repeat-size N                 When assembling observations across repeats, require the total repeat
+                                        length at least this many bp.  (default: 5)
+    --min-repeat-entropy N              To detect interrupted repeats, build across sequence until it has
+                                        entropy > N bits per bp. Set to 0 to turn off. (default: 1)
+    --no-partial-observations           Exclude observations which do not fully span the dynamically-determined
+                                        detection window.  (default, use all observations, dividing partial
+                                        support across matching haplotypes when generating haplotypes.)
+
+**Indel realignment**::
+
+    --dont-left-align-indels            Turn off left-alignment of indels, which is enabled by default.
+
+**Input filters**::
+
+    --use-duplicate-reads               Include duplicate-marked alignments in the analysis.
+                                        default: exclude duplicates marked as such in alignments
+    --min-mapping-quality Q             Exclude alignments from analysis if they have a mapping
+                                        quality less than Q.  default: 1
+    --min-base-quality Q                Exclude alleles from analysis if their supporting base
+                                        quality is less than Q.  default: 0
+    --min-supporting-allele-qsum Q      Consider any allele in which the sum of qualities of supporting
+                                        observations is at least Q.  default: 0
+    --min-supporting-mapping-qsum Q     Consider any allele in which the sum of mapping qualities of
+                                        supporting reads is at least Q.  default: 0
+    --mismatch-base-quality-threshold Q Count mismatches toward --read-mismatch-limit if the base
+                                        quality of the mismatch is >= Q.  default: 10
+    --read-mismatch-limit N             Exclude reads with more than N mismatches where each mismatch
+                                        has base quality >= mismatch-base-quality-threshold.
+                                        default: ~unbounded
+    --read-max-mismatch-fraction N      Exclude reads with more than N [0,1] fraction of mismatches where
+                                        each mismatch has base quality >= mismatch-base-quality-threshold
+                                        default: 1.0
+    --read-snp-limit N                  Exclude reads with more than N base mismatches, ignoring gaps
+                                        with quality >= mismatch-base-quality-threshold.
+                                        default: ~unbounded
+    --read-indel-limit N                Exclude reads with more than N separate gaps.
+                                        default: ~unbounded
+    --standard-filters                  Use stringent input base and mapping quality filters
+                                        Equivalent to -m 30 -q 20 -R 0 -S 0
+    --min-alternate-fraction N          Require at least this fraction of observations supporting
+                                        an alternate allele within a single individual
+                                        in order to evaluate the position.  default: 0.05
+    --min-alternate-count N             Require at least this count of observations supporting
+                                        an alternate allele within a single individual in order
+                                        to evaluate the position.  default: 2
+    --min-alternate-qsum N              Require at least this sum of quality of observations supporting
+                                        an alternate allele within a single individual in order
+                                        to evaluate the position.  default: 0
+    --min-alternate-total N             Require at least this count of observations supporting
+                                        an alternate allele within the total population in order
+                                        to use the allele in analysis.  default: 1
+    --min-coverage N                    Require at least this coverage to process a site. default: 0
+    --limit-coverage N                  Downsample per-sample coverage to this level if greater than this coverage.
+                                        default: no limit
+    --skip-coverage N                   Skip processing of alignments overlapping positions with coverage >N.
+                                        This filters sites above this coverage, but will also reduce data nearby.
+                                        default: no limit
+
+**Population priors**::
+
+    --no-population-priors              Equivalent to --pooled-discrete --hwe-priors-off and removal of
+                                        Ewens Sampling Formula component of priors.
+
+**Mappability priors**::
+
+    --hwe-priors-off                    Disable estimation of the probability of the combination
+                                        arising under HWE given the allele frequency as estimated
+                                        by observation frequency.
+    --binomial-obs-priors-off           Disable incorporation of prior expectations about observations.
+                                        Uses read placement probability, strand balance probability,
+                                        and read position (5'-3') probability.
+    --allele-balance-priors-off         Disable use of aggregate probability of observation balance between alleles
+                                        as a component of the priors.
+
+**Genotype likelihoods**::
+
+    --observation-bias FILE             Read length-dependent allele observation biases from FILE.
+                                        The format is [length] [alignment efficiency relative to reference]
+                                        where the efficiency is 1 if there is no relative observation bias.
+    --base-quality-cap Q                Limit estimated observation quality by capping base quality at Q.
+    --prob-contamination F              An estimate of contamination to use for all samples.  default: 10e-9
+    --legacy-gls                        Use legacy (polybayes equivalent) genotype likelihood calculations
+    --contamination-estimates FILE      A file containing per-sample estimates of contamination, such as
+                                        those generated by VerifyBamID.  The format should be:
+                                        sample p(read=R|genotype=AR) p(read=A|genotype=AA)
+                                        Sample '*' can be used to set default contamination estimates.
+
+**Algorithmic features**::
+
+    --report-genotype-likelihood-max    Report genotypes using the maximum-likelihood estimate provided
+                                        from genotype likelihoods.
+    --genotyping-max-iterations N       Iterate no more than N times during genotyping step. default: 1000.
+    --genotyping-max-banddepth N        Integrate no deeper than the Nth best genotype by likelihood when
+                                        genotyping. default: 6.
+    --posterior-integration-limits N,M  Integrate all genotype combinations in our posterior space
+                                        which include no more than N samples with their Mth best
+                                        data likelihood. default: 1,3.
+    --exclude-unobserved-genotypes      Skip sample genotypings for which the sample has no supporting reads.
+    --genotype-variant-threshold N      Limit posterior integration to samples where the second-best
+                                        genotype likelihood is no more than log(N) from the highest
+                                        genotype likelihood for the sample.  default: ~unbounded
+    --use-mapping-quality               Use mapping quality of alleles when calculating data likelihoods.
+    --harmonic-indel-quality            Use a weighted sum of base qualities around an indel, scaled by the
+                                        distance from the indel.  By default use a minimum BQ in flanking sequence.
+    --read-dependence-factor N          Incorporate non-independence of reads by scaling successive
+                                        observations by this factor during data likelihood
+                                        calculations.  default: 0.9
+    --genotype-qualities                Calculate the marginal probability of genotypes and report as GQ in
+                                        each sample field in the VCF output.
+
+------
+
 **Acknowledgments**
 
 The initial version of the wrapper was produced by Dan Blankenberg and upgraded by Anton Nekrutenko.
 TNG was developed by Bjoern Gruening.
+]]>
     </help>
     <expand macro="citations">
         <citation type="bibtex">

diff --git a/tools/freebayes/macros.xml b/tools/freebayes/macros.xml
@@ -44,6 +44,8 @@
         </conditional>
     </xml>
     <xml name="par_min_cov">
-        <param name="min_coverage" argument="--coverage" type="integer" value="0" label="Require at least this coverage to process a site" />
+        <param name="min_coverage" argument="--min-coverage" type="integer" value="0" label="Require at least this coverage to process a site" />
+        <param name="limit_coverage" argument="--limit-coverage" type="integer" value="0" label="Downsample per-sample coverage to this level if greater than this coverage" />
+        <param name="skip_coverage" argument="--skip-coverage" type="integer" value="0" label="Skip processing of alignments overlapping positions with coverage greater than this" />
     </xml>
 </macros>
diff --git a/tools/freebayes/test-data/freebayes-phix174-test1.vcf b/tools/freebayes/test-data/freebayes-phix174-test1.vcf
@@ -1,5 +1,5 @@
 ##fileformat=VCFv4.2
-##fileDate=20190618
+##fileDate=20190709
 ##source=freeBayes v1.3.1-dirty
 ##reference=localref.fa
 ##contig=<ID=phiX174,length=5386>

diff --git a/tools/freebayes/test-data/freebayes-phix174-test2.vcf b/tools/freebayes/test-data/freebayes-phix174-test2.vcf
@@ -1,10 +1,10 @@
 ##fileformat=VCFv4.2
-##fileDate=20190618
+##fileDate=20190709
 ##source=freeBayes v1.3.1-dirty
 ##reference=localref.fa
 ##contig=<ID=phiX174,length=5386>
 ##phasing=none
-##commandline="freebayes --region phiX174:0..5386 --bam b_0.bam --fasta-reference localref.fa --vcf ./vcf_output/part_phiX174:0..5386.vcf --haplotype-length 0 --min-alternate-count 1 --min-alternate-fraction 0 --pooled-continuous --report-monomorphic --standard-filters --min-coverage 14"
+##commandline="freebayes --region phiX174:0..5386 --bam b_0.bam --fasta-reference localref.fa --vcf ./vcf_output/part_phiX174:0..5386.vcf --haplotype-length 0 --min-alternate-count 1 --min-alternate-fraction 0 --pooled-continuous --report-monomorphic --standard-filters --min-coverage 14 --skip-coverage 0 --limit-coverage 0"
 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
 ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
 ##INFO=<ID=DPB,Number=1,Type=Float,Description="Total read depth per bp at the locus; bases in reads overlapping / bases in haplotype">