Skip to content
This repository has been archived by the owner on Jan 2, 2019. It is now read-only.

Commit

Permalink
blast
Browse files Browse the repository at this point in the history
  • Loading branch information
elizabeth de vogelaere committed Nov 17, 2015
1 parent 4f4e898 commit 07c6815
Show file tree
Hide file tree
Showing 9 changed files with 341 additions and 0 deletions.
3 changes: 3 additions & 0 deletions hpc-blast/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
blastdb
blast-results
std-*
32 changes: 32 additions & 0 deletions hpc-blast/blast-pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/sh

#Step1: get the env variables and create a list of files
source ./config.sh

# Step 2: get all of the fasta files for input
cd "$FASTA_DIR"
export FILES_LIST="$FASTA_DIR/files-list"
pwd
ls *.fasta | sed "s/^\.\///" > $FILES_LIST

# Step 3: split the fasta files into smaller chunks
$SCRIPT_DIR/run-fasta-splitter.sh

# Step 4: run blast on each of the file chunks
while read FILE; do
export FILE="$FILE"
cd $SPLIT_FA_DIR
export SPLIT_FILES_LIST="$SPLIT_FA_DIR/split.$FILE"
ls $FILE.* > $SPLIT_FILES_LIST
NUM_SPLIT_FILES=$(wc -l $SPLIT_FILES_LIST | cut -d ' ' -f 1)

## run blast on each of the smaller chunks against a blast database
#JOB_ID1=$(qsub -v SCRIPT_DIR,SPLIT_FA_DIR,BLAST,EVAL,BLASTDB,BLAST_OUT_DIR,FILE -N blast -e "$STDERR_DIR" -o "$STDOUT_DIR" -J 1-$NUM_SPLIT_FILES $SCRIPT_DIR/run-blast.sh)

## parse the blast results for each of the chunked files
#JOB_ID2=$(qsub -v SCRIPT_DIR,BLAST_OUT_DIR,FILE -W depend=afterany:$JOB_ID1 -e "$STDERR_DIR" -o "$STDOUT_DIR" -J 1-$NUM_SPLIT_FILES $SCRIPT_DIR/run-parse-blast.sh)

#JOB_ID2=$(qsub -v SCRIPT_DIR,BLAST_OUT_DIR,FILE -o "$STDOUT_DIR" -J 1-$NUM_SPLIT_FILES $SCRIPT_DIR/run-parse-blast.sh)

#JOB_ID3=$(qsub -v SCRIPT_DIR,BLAST_OUT_DIR,FILE -o "$STDOUT_DIR" -J 1-$NUM_SPLIT_FILES $SCRIPT_DIR/num16.sh)
done < $FILES_LIST
16 changes: 16 additions & 0 deletions hpc-blast/config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

#directories we need
export CWD=$PWD
export SCRIPT_DIR="$CWD/scripts"
export FASTA_DIR="$CWD/data"
export STDERR_DIR="$CWD/std-err"
export STDOUT_DIR="$CWD/std-out"
export SPLIT_FA_DIR="$FASTA_DIR/faSplit"
export SPLIT_READ_CT="25000"
export BLAST_OUT_DIR="$CWD/blast-results"

# BLAST parameters
export EVAL="1e-10"
export BLAST="blastx"
export BLASTDB="$CWD/blastdb/CAM_PROJ_MarineMicrobes.pep.fa"
96 changes: 96 additions & 0 deletions hpc-blast/scripts/01-bio-searchio.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env perl

use strict;
use warnings;
use autodie;
use Bio::SearchIO;
use feature 'say';
use Getopt::Long;
use Pod::Usage;

my %opts = get_opts();
my @args = @ARGV;

if ($opts{'help'} || $opts{'man'}) {
pod2usage({
-exitval => 0,
-verbose => $opts{'man'} ? 2 : 1
});
}

my $in = new Bio::SearchIO(-format => 'blast',
-file => @ARGV);

say join "\t", qw(query_name name evalue);

while ( my $result = $in->next_result ) {
while ( my $hit = $result->next_hit ) {
while ( my $hsp = $hit->next_hsp ) {
if ( $hsp->evalue < 1e-10 ) {
say join "\t",
$result->query_name,
$hit->name,
$hit->description,
$hsp->percent_identity,
$hsp->length('total'),
$hsp->evalue;
}
}
}
}

# --------------------------------------------------
sub get_opts {
my %opts;
GetOptions(
\%opts,
'help',
'man',
) or pod2usage(2);

return %opts;
}

__END__
# --------------------------------------------------
=pod
=head1 NAME
01-bio-searchio.pl - a script
=head1 SYNOPSIS
01-bio-searchio.pl
Options:
--help Show brief help and exit
--man Show full documentation
=head1 DESCRIPTION
Describe what the script does, what input it expects, what output it
creates, etc.
=head1 SEE ALSO
perl.
=head1 AUTHOR
Beth De Vogelaere E<lt>edevog@email.arizona.eduE<gt>.
=head1 COPYRIGHT
Copyright (c) 2015 Beth De Vogelaere
This module is free software; you can redistribute it and/or
modify it under the terms of the GPL (either version 1, or at
your option, any later version) or the Artistic License 2.0.
Refer to LICENSE for the full license text and to DISCLAIMER for
additional warranty disclaimers.
=cut
117 changes: 117 additions & 0 deletions hpc-blast/scripts/01-fasta-splitter.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env perl
#01-fasta-splitter.pl
use strict;
use warnings;
use autodie;
use feature 'say';
use Bio::SeqIO;
use Cwd 'cwd';
use File::Basename 'basename';
use File::Path 'make_path';
use File::Spec::Functions 'catfile';
use Getopt::Long;
use Pod::Usage;

my %opts = get_opts();
my @args = @ARGV or pod2usage();

if ($opts{'help'} || $opts{'man'}) {
pod2usage({
-exitval => 0,
-verbose => $opts{'man'} ? 2 : 1
});
}

my $maximum = $opts{'number'} || 500;
my $out_dir = $opts{'out_dir'} || cwd();

unless (-d $out_dir){
make_path($out_dir);
}

say "max ($maximum)";

for my $file (@ARGV) {
my $in = Bio::SeqIO->new(-file => $file ,
-format => 'Fasta');

my $count = 0;
my $file_count = 0;
my $writer;
while ( my $seq = $in->next_seq() ) {
$count++;
if (!$writer || $count == $maximum){
$file_count++;
$count=0;

my $out_file = catfile($out_dir,basename($file).'.'.$file_count);

say $out_file;
$writer = Bio::SeqIO->new(-file => ">$out_file",
-format => 'Fasta');
}
say join ": ", $count, $seq->id;
$writer->write_seq($seq);
}
}

say "OK";

# --------------------------------------------------
sub get_opts {
my %opts;
GetOptions(
\%opts,
'number:i',
'out_dir:s',
'help',
'man',
) or pod2usage(2);

return %opts;
}

__END__
# --------------------------------------------------
=pod
=head1 NAME
01-fasta-splitter.pl - problem 1
=head1 SYNOPSIS
01-fasta-splitter.pl -n 20 -o ~/split file1.fa [file2.fa ...]
Options:
--number The maximum number of sequences per file (500)
--out_dir Output directory (cwd)
--help Show brief help and exit
--man Show full documentation
=head1 DESCRIPTION
Splits FASTA files into smaller files with a maximum number of sequences (default 500) into a given output directory (default the current working directory).
=head1 SEE ALSO
perl.
=head1 AUTHOR
Beth De Vogelaere E<lt>edevog@email.arizona.eduE<gt>.
=head1 COPYRIGHT
Copyright (c) 2015 Beth De Vogelaere
This module is free software; you can redistribute it and/or
modify it under the terms of the GPL (either version 1, or at
your option, any later version) or the Artistic License 2.0.
Refer to LICENSE for the full license text and to DISCLAIMER for
additional warranty disclaimers.
=cut
15 changes: 15 additions & 0 deletions hpc-blast/scripts/num16.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

#PBS -W group_list=bhurwitz
#PBS -q windfall
#PBS -l jobtype=cluster_only
#PBS -l select=1:ncpus=2:mem=4gb
#PBS -l place=pak:shared
#PBS -l walltime=24:00:00
#PBS -l cput=24:00:00
#PBS -M edevog@email.arizona.edu
#PBS -m bea

cd BLAST_OUT_DIR

cat $FILE.*.parsed > $FILE.all.parsed
24 changes: 24 additions & 0 deletions hpc-blast/scripts/run-blast.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash

#PBS -W group_list=bhurwitz
#PBS -q windfall
#PBS -l jobtype=cluster_only
#PBS -l select=1:ncpus=2:mem=4gb
#PBS -l place=pack:shared
#PBS -l walltime=24:00:00
#PBS -l cput=24:00:00
#PBS -M edevog@email.arizona.edu
#PBS -m bea

# BLAST parameters
NUM_THREADS=2 # make sure this is requested in the above "select"
DESC=10
ALN=10
MAX_HSPS=10

module load blast
cd "$SPLIT_FA_DIR"
FASTA="$SPLIT_FA_DIR/$FILE.${PBS_ARRAY_INDEX}"

# run blast on each file chunk
$BLAST -query $FASTA -db $BLASTDB -evalue $EVAL -out $BLAST_OUT_DIR/$FILE.${PBS_ARRAY_INDEX}.blastout -max_hsps $MAX_HSPS -num_descriptions $DESC -num_alignments $ALN -num_threads $NUM_THREADS
20 changes: 20 additions & 0 deletions hpc-blast/scripts/run-fasta-splitter.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

source /usr/share/Modules/init/bash
module load perl

if [[ ! -d "$SPLIT_FA_DIR" ]]; then
echo Cannot find faSplit \"$SPLIT_FA_DIR\"
mkdir -p $SPLIT_FA_DIR
fi

if [[ ! -e "$FILES_LIST" ]]; then
echo Cannot find files list \"$FILES_LIST\"
exit 1
fi

cd $FASTA_DIR
while read FILE; do
echo $FILE
$SCRIPT_DIR/01-fasta-splitter.pl -n $SPLIT_READ_CT -o $SPLIT_FA_DIR $FILE
done < $FILES_LIST
18 changes: 18 additions & 0 deletions hpc-blast/scripts/run-parse-blast.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash

#PBS -W group_list=bhurwitz
#PBS -q windfall
#PBS -l jobtype=cluster_only
#PBS -l select=1:ncpus=2:mem=4gb
#PBS -l place=pack:shared
#PBS -l walltime=24:00:00
#PBS -l cput=24:00:00
#PBS -M edevog@email.arizona.edu
#PBS -m bea

module load blast
module load perl
cd $BLAST_OUT_DIR
BLOUT="$BLAST_OUT_DIR/$FILE.${PBS_ARRAY_INDEX}.blastout"
echo File \"$FILE.${PBS_ARRAY_INDEX}.blastout\"
$SCRIPT_DIR/01-bio-searchio.pl $BLOUT > $BLOUT.parsed

0 comments on commit 07c6815

Please sign in to comment.