various changes

brettwhitty · Dec 6, 2017 · 29dc9ad · 29dc9ad
1 parent efbdb04
commit 29dc9ad
Show file tree

Hide file tree

Showing 8 changed files with 132 additions and 27 deletions.
diff --git a/unsorted/bp_genbank2gff3.pl → gff/convert_to_gff/bp_genbank2gff3.pl b/unsorted/bp_genbank2gff3.pl → gff/convert_to_gff/bp_genbank2gff3.pl
diff --git a/unsorted/bp_genbank2gff3_wrapper.pl → ...convert_to_gff/bp_genbank2gff3_wrapper.pl b/unsorted/bp_genbank2gff3_wrapper.pl → ...convert_to_gff/bp_genbank2gff3_wrapper.pl
@@ -14,7 +14,9 @@
 
 $| = 1;
 
-my $bp_genbank2gff3 = '/home/whitty/SVN/bp_genbank2gff3.pl';
+use FindBin qw{ $RealBin };
+
+my $bp_genbank2gff3 = $RealBin.'/bp_genbank2gff3.pl';
 
 my $cmd_line = shift @ARGV || die "Must provide command line";
 my $infile = shift @ARGV || die "Must provide input file";

diff --git a/gff/convert_to_gff/gmod_fasta2gff3.pl b/gff/convert_to_gff/gmod_fasta2gff3.pl
@@ -0,0 +1,107 @@
+#!/usr/bin/perl -w
+
+# Writes a GFF3 file with one whole-length feature per sequence found in the
+# --fasta_dir directory, followed (unless --nosequence is given) by a ##FASTA
+# section holding the sequences themselves.
+
+use lib "/opt/rocks/lib/perl5/site_perl/5.8.8";
+use lib "/opt/rocks/lib/perl5/5.8.8/";
+
+eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
+    if 0; # not running under some shell
+use strict;
+
+use Getopt::Long;
+use Bio::DB::Fasta;
+
+my ($FASTA_DIR, $GFFFILENAME, $TYPE, $SOURCE, $ATTRIBUTES, $NOSEQUENCE);
+
+# On an unparseable command line, show this file's POD as usage and exit.
+GetOptions(
+    'fasta_dir=s'    => \$FASTA_DIR,
+    'gfffilename=s'  => \$GFFFILENAME,
+    'type=s'         => \$TYPE,
+    'source=s'       => \$SOURCE,
+    'attributes=s'   => \$ATTRIBUTES,
+    'nosequence'     => \$NOSEQUENCE,
+  ) or ( system( 'pod2text', $0 ), exit -1 );
+
+my $fastadir = $FASTA_DIR   || './fasta';
+my $gfffile  = $GFFFILENAME || 'out.gff';
+my $type     = $TYPE        || 'EST';
+my $source   = $SOURCE      || '.';
+
+# Lexical handle instead of the bareword OUT, which is a package global.
+open my $out, ">", $gfffile or die "couldn't open $gfffile for writing: $!\n";
+
+my $stream = Bio::DB::Fasta->new($fastadir)->get_PrimarySeq_stream;
+
+print {$out} "##gff-version 3\n";
+print {$out} "#this file generated from $0\n";
+while (my $seq = $stream->next_seq) {
+    # Column 9 always starts with ID=<seq id>; --attributes is appended as given.
+    my $atts = "ID=".$seq->id;
+    $atts .= ";$ATTRIBUTES" if $ATTRIBUTES;
+    # Feature spans 1..length; columns 6-8 are left as "." placeholders.
+    print {$out} join("\t",
+                      $seq->id, $source, $type, 1, $seq->length,
+                      ".", ".", ".",
+                      $atts
+                     ), "\n";
+}
+
+if (!$NOSEQUENCE) {
+    print {$out} "##FASTA\n";
+
+    # The loop above consumed the stream, so build a new one for this pass.
+    $stream = Bio::DB::Fasta->new($fastadir)->get_PrimarySeq_stream;
+
+    while (my $seq = $stream->next_seq) {
+        print {$out} ">".$seq->id."\n";
+        print {$out} $seq->seq . "\n";
+    }
+}
+
+# Buffered write errors only surface when the handle is closed, so check it.
+close $out or die "couldn't close $gfffile: $!\n";
+
+=pod
+
+=head1 NAME
+
+gmod_fasta2gff3.pl - Convert FASTA to simple GFF3
+
+=head1 SYNOPSIS
+
+  % gmod_fasta2gff3.pl [options]
+
+=head1 COMMAND-LINE OPTIONS
+
+  --fasta_dir		Directory containing fasta files
+                           (default: ./fasta)
+  --gfffilename		Name of GFF3 file to be created
+                           (default: ./out.gff)
+  --type                SO type to assign to each feature
+                           (default: EST)
+  --source		Text to appear in source column
+                           (default: .)
+  --attributes		Additional tag=value pairs to appear in column 9
+  --nosequence		Suppress the ##FASTA section (ie, don't
+			   print DNA sequences)
+
+=head1 DESCRIPTION
+
+This script simply takes a collection of fasta files and converts them
+to simple GFF3 suitable for loading into chado.
+
+=head1 AUTHORS
+
+Scott Cain E<lt>cain@cshl.orgE<gt>
+
+Copyright (c) 2006
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself.
+
+=cut
+
diff --git a/unsorted/do_aat_qsub.pl → hpcc_stuff/do_aat_qsub.pl b/unsorted/do_aat_qsub.pl → hpcc_stuff/do_aat_qsub.pl
diff --git a/shell_scripts/solcomp/sol_data_fetch_pipeline.sh b/shell_scripts/solcomp/sol_data_fetch_pipeline.sh
@@ -1,5 +1,8 @@
 #!/bin/bash
 
+## absolute path of this script's directory (quoted so paths with spaces work)
+SCRIPT_PATH="$( cd "$( dirname "$0" )" && pwd )"
+
 ## bioperl
 export PERL5LIB="/scratch/whitty/gmod/bioperl-live"
 
@@ -8,22 +11,29 @@ set -e
 ## set a umask that will allow group rw
 umask 006;
 
-
 ## set working directory root
-WORK_DIR_ROOT=/projects/solcomp/data_working
+WORK_DIR_ROOT=/tmp/solcomp/data_working
 
+## set tools repo dir
+TOOLS_REPO="${SCRIPT_PATH}/../.."
 
 ## scripts used
-FETCH_GENBANK_GIS="/home/whitty/SVN/gb/fetch_genbank_gis_by_entrez_query.pl"
-FETCH_GENBANK_FLATFILES="/home/whitty/SVN/gb/fetch_gbff_from_genbank_by_gi_list.pl"
-SPLIT_GENBANK_FLATFILES="/home/whitty/SVN/gb/split_genbank_flat_files.pl"
-MERGE_GENBANK_FLATFILES="/home/whitty/SVN/gb/merge_gb_by_list_file.pl"
-GENBANK_TO_GFF3="/home/whitty/SVN/bp_genbank2gff3_wrapper.pl"
-FETCH_PUTS="/home/whitty/SVN/mirror_plantgdb_puts_by_taxon.pl"
-PUTS_FASTA_TO_GFF3="/home/whitty/bin/gmod_fasta2gff3.pl" ## copied from GMOD installation
-TAXONOMY_TEST="/home/whitty/SVN/taxonomy/taxon_id_is_solanaceae.pl"
-GET_SCIENTIFIC_NAME="/home/whitty/SVN/taxonomy/get_scientific_name.pl"
-SCP_FILES="/home/whitty/SVN/scp_sequences_to_ftp_site.pl"
+#### ...for fetching Solanaceae sequence records from GenBank
+FETCH_GENBANK_GIS="${TOOLS_REPO}/gb/fetch_genbank_gis_by_entrez_query.pl"
+FETCH_GENBANK_FLATFILES="${TOOLS_REPO}/gb/fetch_gbff_from_genbank_by_gi_list.pl"
+SPLIT_GENBANK_FLATFILES="${TOOLS_REPO}/gb/split_genbank_flat_files.pl"
+MERGE_GENBANK_FLATFILES="${TOOLS_REPO}/gb/merge_gb_by_list_file.pl"
+#### ...for converting GB flat files to GFF3
+GENBANK_TO_GFF3="${TOOLS_REPO}/gff/convert_to_gff/bp_genbank2gff3_wrapper.pl" ## see comments in this script
+#### ...for fetching PlantGDB PUTs files
+FETCH_PUTS="${TOOLS_REPO}/solcomp/site-utils/mirror_plantgdb_puts_by_taxon.pl"
+### ...for converting PUTs fasta to GFF3
+PUTS_FASTA_TO_GFF3="${TOOLS_REPO}/gff/convert_to_gff/gmod_fasta2gff3.pl" ## copied from GMOD installation
+### ...check that records fetched are Solanaceae (by taxon ID)
+TAXONOMY_TEST="${TOOLS_REPO}/taxonomy/taxon_id_is_solanaceae.pl"
+GET_SCIENTIFIC_NAME="${TOOLS_REPO}/taxonomy/get_scientific_name.pl"
+### ...for copying the GBFF, GFF3, FASTA, etc. files created to the FTP site
+SCP_FILES="${TOOLS_REPO}/solcomp/site-utils/scp_sequences_to_ftp_site.pl"
 
 
 ## flag for allowing resuming of previous runs

diff --git a/unsorted/mirror_plantgdb_puts_by_taxon.pl → ...te-utils/mirror_plantgdb_puts_by_taxon.pl b/unsorted/mirror_plantgdb_puts_by_taxon.pl → ...te-utils/mirror_plantgdb_puts_by_taxon.pl
diff --git a/unsorted/scp_sequences_to_ftp_site.pl → ...p/site-utils/scp_sequences_to_ftp_site.pl b/unsorted/scp_sequences_to_ftp_site.pl → ...p/site-utils/scp_sequences_to_ftp_site.pl
diff --git a/unsorted/JSON_test.pl b/unsorted/JSON_test.pl