Initial addition of fasta-stats

ashvark · Nov 21, 2018 · 1f0e7a2 · 1f0e7a2
1 parent 39701b6
commit 1f0e7a2
Show file tree

Hide file tree

Showing 5 changed files with 2,953 additions and 0 deletions.
diff --git a/tools/fasta_stats/.shed.yml b/tools/fasta_stats/.shed.yml
@@ -0,0 +1,7 @@
+categories:
+    - Sequence Analysis
+description: Display summary statistics for a fasta file.
+homepage_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/
+name: fasta_stats
+owner: iuc
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/
diff --git a/tools/fasta_stats/fasta-stats.pl b/tools/fasta_stats/fasta-stats.pl
@@ -0,0 +1,89 @@
+#!/usr/bin/env perl
+
+# fasta-stats
+# written by torsten.seemann@monash.edu 
+# oct 2012
+
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+
+# stat storage
+
+my $n=0;
+my $seq = '';
+my %stat;
+my @len;
+
+# MAIN LOOP collecting sequences
+
+while (my $line = <ARGV>) {
+  chomp $line;
+  if ($line =~ m/^\s*>/) {
+    process($seq) if $n;
+    $n++;
+    $seq='';
+  }
+  else {
+    $seq .= $line;
+  }  
+}
+
+process($seq) if $n;
+
+# sort length array 
+# (should use hash here for efficiency with huge no of short reads?)
+
+@len = sort { $a <=> $b } @len;
+
+# compute more stats
+
+$stat{'num_seq'} = scalar(@len);
+
+if (@len) {
+  $stat{'num_bp'} = sum(@len);
+  $stat{'len_min'} = $len[0];
+  $stat{'len_max'} = $len[-1];
+  $stat{'len_median'} = $len[int(@len/2)];
+  $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} ); 
+  # calculate n50
+
+  $stat{'len_N50'} = 0;
+  my $cum=0;
+  my $thresh = int 0.5 * $stat{'num_bp'};
+  for my $i (0 .. $#len) {
+    $cum += $len[$i];
+    if ($cum >= $thresh) {
+      $stat{'len_N50'} = $len[$i];
+      last;
+    }
+  }
+}
+
+#calculate GC content
+$stat{'num_bp_not_N'} = $stat{'num_G'} + $stat{'num_C'} + $stat{'num_A'} + $stat{'num_T'};
+$stat{'GC_content'} = ($stat{'num_G'} + $stat{'num_C'}) / $stat{'num_bp_not_N'}*100;
+
+# print stats as .tsv
+
+for my $name (sort keys %stat) {
+    if ($name =~ m/GC_content/){
+        printf "%s\t%0.1f\n", $name, $stat{$name};
+    } else {
+        printf "%s\t%s\n", $name, $stat{$name};
+    }
+}
+
+# run for each sequence
+
+sub process {
+  my($s) = @_;
+  # base composition
+  for my $x (qw(A G T C N)) {
+    my $count = $s =~ s/$x/$x/gi;
+    $stat{"num_$x"} += $count;
+  }
+  # keep list of all lengths encountered
+  push @len, length($s);    
+}
+
diff --git a/tools/fasta_stats/fasta-stats.xml b/tools/fasta_stats/fasta-stats.xml
@@ -0,0 +1,59 @@
+<tool id="fasta-stats" name="Fasta Statistics" version="1.0.1">
+    <description>Display summary statistics for a fasta file.</description>
+    <requirements>
+        <requirement type="package" version="5.26">perl</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        perl '${__tool_directory__}/fasta-stats.pl'
+        '$dataset'
+        > '$stats'
+        ]]>
+    </command>
+    <inputs>
+        <param name="dataset" type="data" format="fasta" label="fasta or multifasta file" help="fasta dataset to get statistics for."/>
+    </inputs>
+    <outputs>
+        <data name="stats" format="tabular" label="${tool.name} on ${on_string}: Fasta summary stats"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="dataset" value="test.fasta"/>
+            <output name="stats" file="test_out.txt"/>
+        </test>
+    </tests>
+    <help>
+**Fasta Stats**
+Displays the summary statistics for a fasta file.
+
+Written by Torsten Seemann - Victorian Bioinformatics Consortium
+
+Wrapped by Simon Gladman - Victorian Bioinformatics Consortium
+
+------
+
+Outputs in tabular form:
+    Lengths: n50, min, max, median and average
+
+    Number of base pairs: A, C, G, T, N, Total and Total_not_N
+
+    Number of sequences
+
+    GC content in %
+
+------
+
+Inputs:
+
+Fasta dataset
+    </help>
+    <citations>
+        <citation type="bibtex">
+@UNPUBLISHED{Seemann_Gladman2012,
+    author = {Torsten Seemann and Simon Gladman},
+    title = {Fasta Statistics: Display summary statistics for a fasta file.},
+    year = {2012},
+    url = {https://github.com/galaxyproject/tools-iuc},
+}
+        </citation>
+    </citations>
+</tool>