Skip to content

Commit

Permalink
Initial addition of fasta-stats
Browse files Browse the repository at this point in the history
  • Loading branch information
Slugger70 committed Nov 21, 2018
1 parent 39701b6 commit 1f0e7a2
Show file tree
Hide file tree
Showing 5 changed files with 2,953 additions and 0 deletions.
7 changes: 7 additions & 0 deletions tools/fasta_stats/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
categories:
- Sequence Analysis
description: Display summary statistics for a fasta file.
homepage_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/
name: fasta_stats
owner: iuc
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/
89 changes: 89 additions & 0 deletions tools/fasta_stats/fasta-stats.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/env perl

# fasta-stats
# written by torsten.seemann@monash.edu
# oct 2012

use strict;
use warnings;
use List::Util qw(sum min max);

# stat storage

my $n=0;
my $seq = '';
my %stat;
my @len;

# MAIN LOOP collecting sequences

while (my $line = <ARGV>) {
chomp $line;
if ($line =~ m/^\s*>/) {
process($seq) if $n;
$n++;
$seq='';
}
else {
$seq .= $line;
}
}

process($seq) if $n;

# sort length array
# (should use hash here for efficiency with huge no of short reads?)

@len = sort { $a <=> $b } @len;

# compute more stats

$stat{'num_seq'} = scalar(@len);

if (@len) {
$stat{'num_bp'} = sum(@len);
$stat{'len_min'} = $len[0];
$stat{'len_max'} = $len[-1];
$stat{'len_median'} = $len[int(@len/2)];
$stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} );
# calculate n50

$stat{'len_N50'} = 0;
my $cum=0;
my $thresh = int 0.5 * $stat{'num_bp'};
for my $i (0 .. $#len) {
$cum += $len[$i];
if ($cum >= $thresh) {
$stat{'len_N50'} = $len[$i];
last;
}
}
}

#calculate GC content
$stat{'num_bp_not_N'} = $stat{'num_G'} + $stat{'num_C'} + $stat{'num_A'} + $stat{'num_T'};
$stat{'GC_content'} = ($stat{'num_G'} + $stat{'num_C'}) / $stat{'num_bp_not_N'}*100;

# print stats as .tsv

for my $name (sort keys %stat) {
if ($name =~ m/GC_content/){
printf "%s\t%0.1f\n", $name, $stat{$name};
} else {
printf "%s\t%s\n", $name, $stat{$name};
}
}

# run for each sequence

sub process {
my($s) = @_;
# base composition
for my $x (qw(A G T C N)) {
my $count = $s =~ s/$x/$x/gi;
$stat{"num_$x"} += $count;
}
# keep list of all lengths encountered
push @len, length($s);
}

59 changes: 59 additions & 0 deletions tools/fasta_stats/fasta-stats.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<tool id="fasta-stats" name="Fasta Statistics" version="1.0.1">
<description>Display summary statistics for a fasta file.</description>
<requirements>
<requirement type="package" version="5.26">perl</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
perl '${__tool_directory__}/fasta-stats.pl'
'$dataset'
> '$stats'
]]>
</command>
<inputs>
<param name="dataset" type="data" format="fasta" label="fasta or multifasta file" help="fasta dataset to get statistics for."/>
</inputs>
<outputs>
<data name="stats" format="tabular" label="${tool.name} on ${on_string}: Fasta summary stats"/>
</outputs>
<tests>
<test>
<param name="dataset" value="test.fasta"/>
<output name="stats" file="test_out.txt"/>
</test>
</tests>
<help>
**Fasta Stats**
Displays the summary statistics for a fasta file.

Written by Torsten Seemann - Victorian Bioinformatics Consortium

Wrapped by Simon Gladman - Victorian Bioinformatics Consortium

------

Outputs in tabular form:
Lengths: n50, min, max, median and average

Number of base pairs: A, C, G, T, N, Total and Total_not_N

Number of sequences

GC content in %

------

Inputs:

Fasta dataset
</help>
<citations>
<citation type="bibtex">
@UNPUBLISHED{Seemann_Gladman2012,
author = {Torsten Seemann and Simon Gladman},
title = {Fasta Statistics: Display summary statistics for a fasta file.},
year = {2012},
url = {https://github.com/galaxyproject/tools-iuc},
}
</citation>
</citations>
</tool>
Loading

0 comments on commit 1f0e7a2

Please sign in to comment.