This repository has been archived by the owner on Jan 2, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
elizabeth de vogelaere
committed
Nov 17, 2015
1 parent
4f4e898
commit 07c6815
Showing
9 changed files
with
341 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
blastdb | ||
blast-results | ||
std-* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#!/bin/sh | ||
|
||
#Step1: get the env variables and create a list of files | ||
source ./config.sh | ||
|
||
# Step 2: get all of the fasta files for input | ||
cd "$FASTA_DIR" | ||
export FILES_LIST="$FASTA_DIR/files-list" | ||
pwd | ||
ls *.fasta | sed "s/^\.\///" > $FILES_LIST | ||
|
||
# Step 3: split the fasta files into smaller chunks | ||
$SCRIPT_DIR/run-fasta-splitter.sh | ||
|
||
# Step 4: run blast on each of the file chunks | ||
while read FILE; do | ||
export FILE="$FILE" | ||
cd $SPLIT_FA_DIR | ||
export SPLIT_FILES_LIST="$SPLIT_FA_DIR/split.$FILE" | ||
ls $FILE.* > $SPLIT_FILES_LIST | ||
NUM_SPLIT_FILES=$(wc -l $SPLIT_FILES_LIST | cut -d ' ' -f 1) | ||
|
||
## run blast on each of the smaller chunks against a blast database | ||
#JOB_ID1=$(qsub -v SCRIPT_DIR,SPLIT_FA_DIR,BLAST,EVAL,BLASTDB,BLAST_OUT_DIR,FILE -N blast -e "$STDERR_DIR" -o "$STDOUT_DIR" -J 1-$NUM_SPLIT_FILES $SCRIPT_DIR/run-blast.sh) | ||
|
||
## parse the blast results for each of the chunked files | ||
#JOB_ID2=$(qsub -v SCRIPT_DIR,BLAST_OUT_DIR,FILE -W depend=afterany:$JOB_ID1 -e "$STDERR_DIR" -o "$STDOUT_DIR" -J 1-$NUM_SPLIT_FILES $SCRIPT_DIR/run-parse-blast.sh) | ||
|
||
#JOB_ID2=$(qsub -v SCRIPT_DIR,BLAST_OUT_DIR,FILE -o "$STDOUT_DIR" -J 1-$NUM_SPLIT_FILES $SCRIPT_DIR/run-parse-blast.sh) | ||
|
||
#JOB_ID3=$(qsub -v SCRIPT_DIR,BLAST_OUT_DIR,FILE -o "$STDOUT_DIR" -J 1-$NUM_SPLIT_FILES $SCRIPT_DIR/num16.sh) | ||
done < $FILES_LIST |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash | ||
|
||
#directories we need | ||
export CWD=$PWD | ||
export SCRIPT_DIR="$CWD/scripts" | ||
export FASTA_DIR="$CWD/data" | ||
export STDERR_DIR="$CWD/std-err" | ||
export STDOUT_DIR="$CWD/std-out" | ||
export SPLIT_FA_DIR="$FASTA_DIR/faSplit" | ||
export SPLIT_READ_CT="25000" | ||
export BLAST_OUT_DIR="$CWD/blast-results" | ||
|
||
# BLAST parameters | ||
export EVAL="1e-10" | ||
export BLAST="blastx" | ||
export BLASTDB="$CWD/blastdb/CAM_PROJ_MarineMicrobes.pep.fa" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#!/usr/bin/env perl | ||
|
||
use strict; | ||
use warnings; | ||
use autodie; | ||
use Bio::SearchIO; | ||
use feature 'say'; | ||
use Getopt::Long; | ||
use Pod::Usage; | ||
|
||
my %opts = get_opts(); | ||
my @args = @ARGV; | ||
|
||
if ($opts{'help'} || $opts{'man'}) { | ||
pod2usage({ | ||
-exitval => 0, | ||
-verbose => $opts{'man'} ? 2 : 1 | ||
}); | ||
} | ||
|
||
my $in = new Bio::SearchIO(-format => 'blast', | ||
-file => @ARGV); | ||
|
||
say join "\t", qw(query_name name evalue); | ||
|
||
while ( my $result = $in->next_result ) { | ||
while ( my $hit = $result->next_hit ) { | ||
while ( my $hsp = $hit->next_hsp ) { | ||
if ( $hsp->evalue < 1e-10 ) { | ||
say join "\t", | ||
$result->query_name, | ||
$hit->name, | ||
$hit->description, | ||
$hsp->percent_identity, | ||
$hsp->length('total'), | ||
$hsp->evalue; | ||
} | ||
} | ||
} | ||
} | ||
|
||
# -------------------------------------------------- | ||
sub get_opts { | ||
my %opts; | ||
GetOptions( | ||
\%opts, | ||
'help', | ||
'man', | ||
) or pod2usage(2); | ||
|
||
return %opts; | ||
} | ||
|
||
__END__ | ||
# -------------------------------------------------- | ||
=pod | ||
=head1 NAME | ||
01-bio-searchio.pl - a script | ||
=head1 SYNOPSIS | ||
01-bio-searchio.pl | ||
Options: | ||
--help Show brief help and exit | ||
--man Show full documentation | ||
=head1 DESCRIPTION | ||
Describe what the script does, what input it expects, what output it | ||
creates, etc. | ||
=head1 SEE ALSO | ||
perl. | ||
=head1 AUTHOR | ||
Beth De Vogelaere E<lt>edevog@email.arizona.eduE<gt>. | ||
=head1 COPYRIGHT | ||
Copyright (c) 2015 Beth De Vogelaere | ||
This module is free software; you can redistribute it and/or | ||
modify it under the terms of the GPL (either version 1, or at | ||
your option, any later version) or the Artistic License 2.0. | ||
Refer to LICENSE for the full license text and to DISCLAIMER for | ||
additional warranty disclaimers. | ||
=cut |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
#!/usr/bin/env perl | ||
#01-fasta-splitter.pl | ||
use strict; | ||
use warnings; | ||
use autodie; | ||
use feature 'say'; | ||
use Bio::SeqIO; | ||
use Cwd 'cwd'; | ||
use File::Basename 'basename'; | ||
use File::Path 'make_path'; | ||
use File::Spec::Functions 'catfile'; | ||
use Getopt::Long; | ||
use Pod::Usage; | ||
|
||
my %opts = get_opts(); | ||
my @args = @ARGV or pod2usage(); | ||
|
||
if ($opts{'help'} || $opts{'man'}) { | ||
pod2usage({ | ||
-exitval => 0, | ||
-verbose => $opts{'man'} ? 2 : 1 | ||
}); | ||
} | ||
|
||
my $maximum = $opts{'number'} || 500; | ||
my $out_dir = $opts{'out_dir'} || cwd(); | ||
|
||
unless (-d $out_dir){ | ||
make_path($out_dir); | ||
} | ||
|
||
say "max ($maximum)"; | ||
|
||
for my $file (@ARGV) { | ||
my $in = Bio::SeqIO->new(-file => $file , | ||
-format => 'Fasta'); | ||
|
||
my $count = 0; | ||
my $file_count = 0; | ||
my $writer; | ||
while ( my $seq = $in->next_seq() ) { | ||
$count++; | ||
if (!$writer || $count == $maximum){ | ||
$file_count++; | ||
$count=0; | ||
|
||
my $out_file = catfile($out_dir,basename($file).'.'.$file_count); | ||
|
||
say $out_file; | ||
$writer = Bio::SeqIO->new(-file => ">$out_file", | ||
-format => 'Fasta'); | ||
} | ||
say join ": ", $count, $seq->id; | ||
$writer->write_seq($seq); | ||
} | ||
} | ||
|
||
say "OK"; | ||
|
||
# -------------------------------------------------- | ||
sub get_opts { | ||
my %opts; | ||
GetOptions( | ||
\%opts, | ||
'number:i', | ||
'out_dir:s', | ||
'help', | ||
'man', | ||
) or pod2usage(2); | ||
|
||
return %opts; | ||
} | ||
|
||
__END__ | ||
# -------------------------------------------------- | ||
=pod | ||
=head1 NAME | ||
01-fasta-splitter.pl - problem 1 | ||
=head1 SYNOPSIS | ||
01-fasta-splitter.pl -n 20 -o ~/split file1.fa [file2.fa ...] | ||
Options: | ||
--number The maximum number of sequences per file (500) | ||
--out_dir Output directory (cwd) | ||
--help Show brief help and exit | ||
--man Show full documentation | ||
=head1 DESCRIPTION | ||
Splits FASTA files into smaller files with a maximum number of sequences (default 500) into a given output directory (default the current working directory). | ||
=head1 SEE ALSO | ||
perl. | ||
=head1 AUTHOR | ||
Beth De Vogelaere E<lt>edevog@email.arizona.eduE<gt>. | ||
=head1 COPYRIGHT | ||
Copyright (c) 2015 Beth De Vogelaere | ||
This module is free software; you can redistribute it and/or | ||
modify it under the terms of the GPL (either version 1, or at | ||
your option, any later version) or the Artistic License 2.0. | ||
Refer to LICENSE for the full license text and to DISCLAIMER for | ||
additional warranty disclaimers. | ||
=cut |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#!/bin/bash | ||
|
||
#PBS -W group_list=bhurwitz | ||
#PBS -q windfall | ||
#PBS -l jobtype=cluster_only | ||
#PBS -l select=1:ncpus=2:mem=4gb | ||
#PBS -l place=pak:shared | ||
#PBS -l walltime=24:00:00 | ||
#PBS -l cput=24:00:00 | ||
#PBS -M edevog@email.arizona.edu | ||
#PBS -m bea | ||
|
||
cd BLAST_OUT_DIR | ||
|
||
cat $FILE.*.parsed > $FILE.all.parsed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#!/bin/bash | ||
|
||
#PBS -W group_list=bhurwitz | ||
#PBS -q windfall | ||
#PBS -l jobtype=cluster_only | ||
#PBS -l select=1:ncpus=2:mem=4gb | ||
#PBS -l place=pack:shared | ||
#PBS -l walltime=24:00:00 | ||
#PBS -l cput=24:00:00 | ||
#PBS -M edevog@email.arizona.edu | ||
#PBS -m bea | ||
|
||
# BLAST parameters | ||
NUM_THREADS=2 # make sure this is requested in the above "select" | ||
DESC=10 | ||
ALN=10 | ||
MAX_HSPS=10 | ||
|
||
module load blast | ||
cd "$SPLIT_FA_DIR" | ||
FASTA="$SPLIT_FA_DIR/$FILE.${PBS_ARRAY_INDEX}" | ||
|
||
# run blast on each file chunk | ||
$BLAST -query $FASTA -db $BLASTDB -evalue $EVAL -out $BLAST_OUT_DIR/$FILE.${PBS_ARRAY_INDEX}.blastout -max_hsps $MAX_HSPS -num_descriptions $DESC -num_alignments $ALN -num_threads $NUM_THREADS |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/bin/bash | ||
|
||
source /usr/share/Modules/init/bash | ||
module load perl | ||
|
||
if [[ ! -d "$SPLIT_FA_DIR" ]]; then | ||
echo Cannot find faSplit \"$SPLIT_FA_DIR\" | ||
mkdir -p $SPLIT_FA_DIR | ||
fi | ||
|
||
if [[ ! -e "$FILES_LIST" ]]; then | ||
echo Cannot find files list \"$FILES_LIST\" | ||
exit 1 | ||
fi | ||
|
||
cd $FASTA_DIR | ||
while read FILE; do | ||
echo $FILE | ||
$SCRIPT_DIR/01-fasta-splitter.pl -n $SPLIT_READ_CT -o $SPLIT_FA_DIR $FILE | ||
done < $FILES_LIST |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#!/bin/bash | ||
|
||
#PBS -W group_list=bhurwitz | ||
#PBS -q windfall | ||
#PBS -l jobtype=cluster_only | ||
#PBS -l select=1:ncpus=2:mem=4gb | ||
#PBS -l place=pack:shared | ||
#PBS -l walltime=24:00:00 | ||
#PBS -l cput=24:00:00 | ||
#PBS -M edevog@email.arizona.edu | ||
#PBS -m bea | ||
|
||
module load blast | ||
module load perl | ||
cd $BLAST_OUT_DIR | ||
BLOUT="$BLAST_OUT_DIR/$FILE.${PBS_ARRAY_INDEX}.blastout" | ||
echo File \"$FILE.${PBS_ARRAY_INDEX}.blastout\" | ||
$SCRIPT_DIR/01-bio-searchio.pl $BLOUT > $BLOUT.parsed |