Skip to content

Commit

Permalink
various changes
Browse files Browse the repository at this point in the history
  • Loading branch information
brettwhitty committed Dec 6, 2017
1 parent efbdb04 commit 29dc9ad
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 27 deletions.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@

$| = 1;

my $bp_genbank2gff3 = '/home/whitty/SVN/bp_genbank2gff3.pl';
use FindBin qw{ $RealBin };

my $bp_genbank2gff3 = $RealBin.'/bp_genbank2gff3.pl';

my $cmd_line = shift @ARGV || die "Must provide command line";
my $infile = shift @ARGV || die "Must provide input file";
Expand Down
107 changes: 107 additions & 0 deletions gff/convert_to_gff/gmod_fasta2gff3.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/perl -w

use lib "/opt/rocks/lib/perl5/site_perl/5.8.8";
use lib "/opt/rocks/lib/perl5/5.8.8/";

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell
use strict;

use Getopt::Long;
use Bio::DB::Fasta;

# Writes a GFF3 file with one feature line per sequence found by
# Bio::DB::Fasta under the given FASTA directory.  Each feature spans the
# whole sequence (1 .. length).  Unless --nosequence is given, the sequences
# themselves are appended in a trailing ##FASTA section.
# See the POD at the end of this file for the command-line options.

my ($FASTA_DIR, $GFFFILENAME, $TYPE, $SOURCE, $ATTRIBUTES, $NOSEQUENCE);

# On an option-parsing error, show the POD below as usage text and exit
# with a failure status.
GetOptions(
    'fasta_dir=s'   => \$FASTA_DIR,
    'gfffilename=s' => \$GFFFILENAME,
    'type=s'        => \$TYPE,
    'source=s'      => \$SOURCE,
    'attributes=s'  => \$ATTRIBUTES,
    'nosequence'    => \$NOSEQUENCE,
) or ( system( 'pod2text', $0 ), exit -1 );

# Fall back to the documented defaults for options that were not supplied.
my $fastadir = $FASTA_DIR   || './fasta';
my $gfffile  = $GFFFILENAME || 'out.gff';
my $type     = $TYPE        || 'EST';
my $source   = $SOURCE      || '.';

# Lexical filehandle: scoped to this script rather than a package-global
# bareword that any other code could clobber.
open my $out_fh, ">", $gfffile
    or die "couldn't open $gfffile for writing: $!\n";

my $stream = Bio::DB::Fasta->new($fastadir)->get_PrimarySeq_stream;

print {$out_fh} "##gff-version 3\n";
print {$out_fh} "#this file generated from $0\n";

# One feature line per sequence, covering the full length of the sequence.
while (my $seq = $stream->next_seq) {
    # Column 9 always carries the ID; user-supplied tag=value pairs follow it.
    my $atts = "ID=" . $seq->id;
    $atts .= ";$ATTRIBUTES" if $ATTRIBUTES;

    print {$out_fh} join("\t",
        $seq->id,           # seqid
        $source,            # source
        $type,              # type
        1,                  # start
        $seq->length,       # end
        ".", ".", ".",      # score, strand, phase: not applicable
        $atts,              # attributes
    ), "\n";
}

if (!$NOSEQUENCE) {
    print {$out_fh} "##FASTA\n";

    # The first stream has been consumed; open a fresh one for a second pass.
    $stream = Bio::DB::Fasta->new($fastadir)->get_PrimarySeq_stream;

    while (my $seq = $stream->next_seq) {
        print {$out_fh} ">" . $seq->id . "\n";
        print {$out_fh} $seq->seq . "\n";
    }
}

# Buffered write errors (full disk, quota) only surface at close time, so a
# failed close must not be mistaken for a successfully written file.
close $out_fh
    or die "couldn't close $gfffile after writing: $!\n";

=pod

=head1 NAME

gmod_fasta2gff3.pl - Convert FASTA to simple GFF3

=head1 SYNOPSIS

  % gmod_fasta2gff3.pl [options]

=head1 COMMAND-LINE OPTIONS

  --fasta_dir     Directory containing fasta files
                  (default: ./fasta)
  --gfffilename   Name of GFF3 file to be created
                  (default: ./out.gff)
  --type          SO type to assign to each feature
                  (default: EST)
  --source        Text to appear in source column
                  (default: .)
  --attributes    Additional tag=value pairs to appear in column 9
  --nosequence    Suppress the ##FASTA section (ie, don't
                  print DNA sequences)

=head1 DESCRIPTION

This script simply takes a collection of fasta files and converts them
to simple GFF3 suitable for loading into chado.

=head1 AUTHORS

Scott Cain E<lt>cain@cshl.orgE<gt>

Copyright (c) 2006

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut

File renamed without changes.
34 changes: 22 additions & 12 deletions shell_scripts/solcomp/sol_data_fetch_pipeline.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/bash

## this script's path (absolute path of the directory containing this script)
## $(...) nests cleanly, so each level is quoted on its own: a path containing
## whitespace is passed to cd as one word, and no escaped quotes are needed.
SCRIPT_PATH="$( cd -- "$( dirname -- "$0" )" && pwd )"

## bioperl
export PERL5LIB="/scratch/whitty/gmod/bioperl-live"

Expand All @@ -8,22 +11,29 @@ set -e
## set a umask that will allow group rw
umask 006;


## set working directory root
WORK_DIR_ROOT=/projects/solcomp/data_working
WORK_DIR_ROOT=/tmp/solcomp/data_working

## set tools repo dir
TOOLS_REPO="${SCRIPT_PATH}/../.."

## scripts used
FETCH_GENBANK_GIS="/home/whitty/SVN/gb/fetch_genbank_gis_by_entrez_query.pl"
FETCH_GENBANK_FLATFILES="/home/whitty/SVN/gb/fetch_gbff_from_genbank_by_gi_list.pl"
SPLIT_GENBANK_FLATFILES="/home/whitty/SVN/gb/split_genbank_flat_files.pl"
MERGE_GENBANK_FLATFILES="/home/whitty/SVN/gb/merge_gb_by_list_file.pl"
GENBANK_TO_GFF3="/home/whitty/SVN/bp_genbank2gff3_wrapper.pl"
FETCH_PUTS="/home/whitty/SVN/mirror_plantgdb_puts_by_taxon.pl"
PUTS_FASTA_TO_GFF3="/home/whitty/bin/gmod_fasta2gff3.pl" ## copied from GMOD installation
TAXONOMY_TEST="/home/whitty/SVN/taxonomy/taxon_id_is_solanaceae.pl"
GET_SCIENTIFIC_NAME="/home/whitty/SVN/taxonomy/get_scientific_name.pl"
SCP_FILES="/home/whitty/SVN/scp_sequences_to_ftp_site.pl"
#### ...for fetching Solanaceae sequence records from GenBank
FETCH_GENBANK_GIS="${TOOLS_REPO}/gb/fetch_genbank_gis_by_entrez_query.pl"
FETCH_GENBANK_FLATFILES="${TOOLS_REPO}/gb/fetch_gbff_from_genbank_by_gi_list.pl"
SPLIT_GENBANK_FLATFILES="${TOOLS_REPO}/gb/split_genbank_flat_files.pl"
MERGE_GENBANK_FLATFILES="${TOOLS_REPO}/gb/merge_gb_by_list_file.pl"
#### ...for converting GB flat files to GFF3
GENBANK_TO_GFF3="${TOOLS_REPO}/gff/convert_to_gff/bp_genbank2gff3_wrapper.pl" ## see comments in this script
#### ...for fetching PlantGDB PUTs files
FETCH_PUTS="${TOOLS_REPO}/solcomp/site-utils/mirror_plantgdb_puts_by_taxon.pl"
### ...for converting PUTs fasta to GFF3
PUTS_FASTA_TO_GFF3="${TOOLS_REPO}/gff/convert_to_gff/gmod_fasta2gff3.pl" ## copied from GMOD installation
### ...check that records fetched are Solanaceae (by taxon ID)
TAXONOMY_TEST="${TOOLS_REPO}/taxonomy/taxon_id_is_solanaceae.pl"
GET_SCIENTIFIC_NAME="${TOOLS_REPO}/taxonomy/get_scientific_name.pl"
### ...for copying the GBFF, GFF3, FASTA, etc. files created to the FTP site
SCP_FILES="${TOOLS_REPO}/solcomp/site-utils/scp_sequences_to_ftp_site.pl"


## flag for allowing resuming of previous runs
Expand Down
File renamed without changes.
File renamed without changes.
14 changes: 0 additions & 14 deletions unsorted/JSON_test.pl

This file was deleted.

0 comments on commit 29dc9ad

Please sign in to comment.