Skip to content

Commit

Permalink
Add in tblastn function mainly implemented by Andrew
Browse files Browse the repository at this point in the history
  • Loading branch information
milnus committed Nov 6, 2022
1 parent b45235e commit 0c4df86
Show file tree
Hide file tree
Showing 16 changed files with 191 additions and 22 deletions.
21 changes: 12 additions & 9 deletions Magphi/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import logging
from sys import argv
from math import ceil
import pkg_resources # ??
import pkg_resources
import concurrent.futures

try:
Expand All @@ -35,9 +35,9 @@
from exit_with_error import exit_with_error

try:
from Magphi.check_inputs import check_inputs
from Magphi.check_inputs import check_inputs, check_seed_type
except ModuleNotFoundError:
from check_inputs import check_inputs
from check_inputs import check_inputs, check_seed_type

try:
from Magphi.split_gff_file import split_gff_files
Expand Down Expand Up @@ -141,7 +141,6 @@ def main():
start_time = time.time()

# Retrieve the flags given by the user in the commandline
# TODO - add in argument to change tmp directory to user defined place (scratch which is faster?)
cmd_args = get_commandline_arguments(argv[1:], PROGRAM_VERSION)

# Try to construct the output folder and except if it does exist
Expand All @@ -150,11 +149,8 @@ def main():
except FileExistsError:
warnings.warn("Output folder already exists")
pass

# add in proteins flag
proteins = cmd_args.protein_seed

"Orchestrate the execution of the program"
# Orchestrate the execution of the program
file_logger = init_logging(cmd_args.log, cmd_args.quiet, cmd_args.out_path)

# Check dependencies for Magphi and logging of versions to file
Expand All @@ -173,7 +169,14 @@ def main():
# Check the input files
file_type, is_input_gzipped = check_inputs(cmd_args.genomes, file_logger)

# Check if input seeds are nucleotide or proteins
protein_seed_check = check_seed_type(cmd_args.seeds, file_logger)
if cmd_args.protein_seed is not protein_seed_check:
warnings.warn('Detected seed type does not match user specified seed type - Assuming seeds are protein sequences')
cmd_args.protein_seed = protein_seed_check

# construct a temporary folder to hold files
# TODO - Use the standard pythonic way of constructing temporary dicts! - See Corekaburra
file_logger.debug("Try to construct output folder")
tmp_folder = os.path.join(cmd_args.out_path, "Magphi_tmp_folder")
try:
Expand Down Expand Up @@ -210,7 +213,7 @@ def main():
with concurrent.futures.ThreadPoolExecutor(max_workers=cmd_args.cpu) as executor:
results = [executor.submit(screen_genome_for_seeds, cmd_args.genomes[i], seed_pairs, cmd_args.seeds,
tmp_folder, cmd_args.include_seeds, file_type,
cmd_args.out_path, cmd_args.max_seed_dist, file_logger, is_input_gzipped, print_seq_out, proteins)
cmd_args.out_path, cmd_args.max_seed_dist, file_logger, is_input_gzipped, print_seq_out, cmd_args.protein_seed)
for i, genome in enumerate(cmd_args.genomes)]

for f in concurrent.futures.as_completed(results):
Expand Down
62 changes: 62 additions & 0 deletions Magphi/check_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,65 @@ def check_inputs(input_files, file_logger):
exit_status=EXIT_INPUT_FILE_ERROR)

return file_type, is_input_gzipped


def check_string_alphabet(string_dict, alphabet):
# Test if all seeds use the alphabet given
seed_list = []
for seed in string_dict.values():
seed_char_list = []
for char in set(seed):
if char.upper() in alphabet:
seed_char_list.append(True)
else:
seed_char_list.append(False)

seed_list.append(all(seed_char_list))
# stop if one non alphabet matching sequence is identified
if not all(seed_list):
break

return all(seed_list)


def check_seed_type(seed_file_path, file_logger):
"""
Function to take a seed file input and determine seeds are likely to be nucleotide, protein, or invalid.
:param seed_file_path: Filepath for the seed file, holding all input seeds to be searched
:param file_logger: Logger for outputs.
:return: Boolean indicating if the seed file hold protein seed sequences.
"""
file_logger.debug('Testing input seeds')

amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
nucleotides = ['A', 'T', 'C', 'G']

# Construct dict to hold seeds
seed_dict = {}
# Read in seed file
with open(seed_file_path, 'r') as seed_file:
for line in seed_file:
line = line.strip()
if ">" in line:
seed_dict[line] = ''
current_line = line
else:
seed_dict[current_line] = seed_dict[current_line] + line

# Test if nucleotide:
is_nucleotide = check_string_alphabet(seed_dict, nucleotides)

if not is_nucleotide:
# Check if seeds are amino acids
is_aminoacids = check_string_alphabet(seed_dict, amino_acids)
else:
file_logger.debug('Testing input seeds are nucleotide')
# Everything is nucleotide - return
return False

if not is_aminoacids:
file_logger.debug('Testing input seeds are faulty!')
exit_with_error(message='Bad input seeds - Could be illegal characters', exit_status=EXIT_INPUT_FILE_ERROR)
else:
file_logger.debug('Testing input seeds are protein')
return True
12 changes: 6 additions & 6 deletions Magphi/commandline_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

EXIT_COMMAND_LINE_ERROR = 2

# def get_commandline_arguments(args, version):

def get_commandline_arguments(args, version):
"""
Command line interface for Magphi.
Expand Down Expand Up @@ -79,11 +79,11 @@ def get_commandline_arguments(args, version):
default=True)

# Add flag for tblastn option (translated protein seed search for nucleotide sequences)
parser.add_argument("-p", "--protein_seed",
help="to use tblastn instead of blastn when protein seeds are supplied - useful for hits across diverse genomes",
required=False,
action="store_true",
default=False)
parser.add_argument("-p", "--protein_seed",
help="to use tblastn instead of blastn when protein seeds are supplied - useful for hits across diverse genomes",
required=False,
action="store_true",
default=False)

# Add the flag for the output folder
parser.add_argument('-o',
Expand Down
16 changes: 11 additions & 5 deletions Magphi/search_insertion_sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def tblastn_insertion_site(seeds, genome_file, tmp_name):

return blast_out_xml


def blast_insertion_site(seeds, genome_file, tmp_name):
"""
Function to BLAST seed sequences against a given genome
Expand Down Expand Up @@ -812,8 +813,8 @@ def extract_seqs_n_annots(merged_bed_files, file_type, genome_file, annotation_f


def screen_genome_for_seeds(genome_file, seed_pairs, seed_path, tmp_folder,
include_seeds, file_type, out_path, max_seed_dist, file_logger,
is_input_gzipped, print_seq_out, proteins):
include_seeds, file_type, out_path, max_seed_dist, file_logger,
is_input_gzipped, print_seq_out, proteins):
"""
Function that summarise the search of seeds sequences, determination of position and extraction.
:param genome_file: Path to genome to be searched for seed sequences
Expand All @@ -828,6 +829,7 @@ def screen_genome_for_seeds(genome_file, seed_pairs, seed_path, tmp_folder,
:param file_logger: File to with debug log statements should be written.
:param is_input_gzipped: Bool to tell if the input file is gzipped
:param print_seq_out: Bool to indicate if outputs related to fasta and gff sequences should be given
:param proteins: Boolean indicator if input search seqeunce is proteins
:return: Number of times a seed sequence pairs hit the genome,
number of annotations in the interval found between seed sequences,
name of the genome extracted from,
Expand Down Expand Up @@ -872,16 +874,18 @@ def screen_genome_for_seeds(genome_file, seed_pairs, seed_path, tmp_folder,

# run blastn or tblastn depending on --protein flag present
if proteins:
file_logger.debug('\tUsing tblastn')
blast_xml_output = tblastn_insertion_site(seed_path, genome_file, f'{genome_name}_blast')
else:
file_logger.debug('\tUsing regular blast')
blast_xml_output = blast_insertion_site(seed_path, genome_file, f'{genome_name}_blast')

# Construct a bedfile from blast output
file_logger.debug(f"\tConstructing bed file: {genome_file}")
blast_hit_beds, exclude_seed_list, seed_hits = blast_out_to_sorted_bed(blast_xml_output,
include_seeds,
genome_name,
seed_pairs)
include_seeds,
genome_name,
seed_pairs)

# Examine the seed hits and try to find solutions when multiple seeds hit at once
file_logger.debug(f"\tChecking seed sequence placement: {genome_file}")
Expand All @@ -893,6 +897,8 @@ def screen_genome_for_seeds(genome_file, seed_pairs, seed_path, tmp_folder,
merged_bed_files, seed_evidence = bed_merge_handling(blast_hit_beds, include_seeds, exclude_seed_list,
max_seed_dist, seed_evidence)

# TODO - Find a way to fix the orientation of connected segments based on the first primer for better use of the output.

# Extract sequences and annotations using merged intervals.
file_logger.debug(f"\tExtracting sequences from intervals: {genome_file}")
annots_per_interval, break_seeds, seed_evidence, inter_seed_dist = extract_seqs_n_annots(merged_bed_files,
Expand Down
16 changes: 16 additions & 0 deletions functional_tests/Magphi-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,22 @@ test_output_file test_out_folder/master_seed_evidence.csv larger_Magphi_test/mas
rm -r test_out_folder


# Test input seeds being proteins and the tblastn function
call_new_test "Test input seeds being proteins and the tblastn function - with correct -p flag"
Magphi -g tblasn_test/tblastn_genome.fasta -s tblasn_test/prot_seeds.fasta -o test_out_folder -md 140 -p
test_output_file test_out_folder/master_seed_evidence.csv tblastn_simple_expected/master_seed_evidence.expected
test_output_file test_out_folder/inter_seed_distance.csv tblastn_simple_expected/inter_seed_distance.expected
test_output_file test_out_folder/contig_hit_matrix.csv tblastn_simple_expected/master_seed_evidence.expected
rm -r test_out_folder

# Test input seeds being proteins and the tblastn function
call_new_test "Test input seeds being proteins and the tblastn function - No -p flag"
Magphi -g tblasn_test/tblastn_genome.fasta -s tblasn_test/prot_seeds.fasta -o test_out_folder -md 140
test_output_file test_out_folder/master_seed_evidence.csv tblastn_simple_expected/master_seed_evidence.expected
test_output_file test_out_folder/inter_seed_distance.csv tblastn_simple_expected/inter_seed_distance.expected
test_output_file test_out_folder/contig_hit_matrix.csv tblastn_simple_expected/master_seed_evidence.expected
rm -r test_out_folder


# 3. End of testing - check if any errors occurrred
if [ "$num_errors" -gt 0 ]; then
Expand Down
4 changes: 3 additions & 1 deletion functional_tests/test_data/no_input.expected
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
usage: Magphi [-h] -g .fa/.gff [.fa/.gff ...] -s multi_fasta_file.fa [-is]
[-md int] [-b | -n] [-o path/to/output] [-c int] [-l | -q]
[-md int] [-b | -n] [-p] [-o path/to/output] [-c int] [-l | -q]
[--check] [-v]

Welcome to Magphi! This program will extract sequences and possible
Expand All @@ -26,6 +26,8 @@ optional arguments:
-n, --no_sequences Argument to not print outputs related to annotations
or sequences found between seeds [default: sequences
are printed]
-p, --protein_seed to use tblastn instead of blastn when protein seeds
are supplied - useful for hits across diverse genomes
-o path/to/output, --output_folder path/to/output
Give path to output folder [default: current folder]
-c int, --cpu int Give max number of CPUs [default: 1]
Expand Down
4 changes: 4 additions & 0 deletions functional_tests/test_data/tblasn_test/prot_seeds.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>prot_seed_1
MSLPNCLNCQSEYVYEDGHLLICPMCSFEWTPGQEEEHSEQIEVRDSNGNLLSDGDTVTVIKDLKVKGAPKDLKQGTRVKNIRLVEGDHNIDCKIDGFGAMKLKSEFVKKI
>prot_seed_2
MEFQKELIKLNQSFTNYEEAIRFCGQLLYHFDYVEEDYIQAMVDRNNELSVYMGNFIAIPHGTDAAKEKVKKSGLVVVQVPDGVNFGTEEKPQIATVLFGIAGIGDEHLQLIQKISIFCADVDNVIKLADAKTSEEVIRLLNSVE
22 changes: 22 additions & 0 deletions functional_tests/test_data/tblasn_test/tblastn_genome.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
>tblastn_genome
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
TATGTCATTACCAAATTGTTTAAACTGTCAGTCAGAATATGTTTATGAAGATGGACATCT
ATTAATTTGTCCTATGTGTTCATTTGAATGGACTCCAGGCCAAGAAGAAGAGCATTCAGA
ACAAATAGAAGTTCGTGATAGTAATGGTAATCTTCTATCAGATGGAGATACAGTAACTGT
TATTAAAGATTTGAAAGTAAAAGGCGCTCCAAAAGACCTGAAACAAGGGACTCGTGTTAA
AAATATTCGCTTGGTGGAAGGTGATCATAACATTGATTGTAAGATTGATGGGTTTGGCGC
GATGAAATTGAAATCAGAATTTGTCAAGAAAATCTAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
ATGGAATTTCAAAAAGAATTAATCAAATTGAATCAATCCTTCACTAATTATGAAGAAGCC
ATTCGTTTTTGTGGTCAACTTCTTTATCATTTTGATTATGTTGAAGAAGATTATATTCAA
GCAATGGTAGATCGAAATAATGAACTCTCTGTTTATATGGGGAACTTTATTGCGATACCA
CATGGAACAGATGCAGCAAAAGAAAAAGTCAAAAAATCAGGGCTAGTTGTTGTTCAAGTC
CCAGATGGTGTTAATTTTGGAACTGAAGAAAAACCTCAAATTGCAACAGTTTTATTTGGA
ATCGCAGGCATTGGAGATGAACACCTTCAGTTGATTCAAAAAATATCCATTTTCTGCGCT
GATGTTGATAATGTTATCAAACTAGCTGATGCTAAAACAAGTGAAGAAGTGATTAGATTA
CTCAATTCTGTTGAGTAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
genome,prot_seed
tblastn_genome,2
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
genome,prot_seed
tblastn_genome,86
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
genome,prot_seed
tblastn_genome,5B
28 changes: 27 additions & 1 deletion unit_tests/Magphi_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
try:
os.chdir('/Magphi/unit_tests/unit_test_data/')
except FileNotFoundError:
os.chdir('../unit_test_data/')
os.chdir('../unit_tests/unit_test_data/')


class TestExitWithError(unittest.TestCase):
Expand Down Expand Up @@ -195,6 +195,32 @@ def test_non_gzipped_files(self):
self.assertEqual(False, check_inputs.check_if_gzip(files, self.logger))


class TestSeedRecognition(unittest.TestCase):
''' Test that seeds of invalid, nucleotide, and protein origin can be recognized when inputted '''
@classmethod
def setUpClass(cls):
cls.logger = logging.getLogger('test_logger.log')
cls.logger.setLevel(logging.INFO)

def test_faulty_seeds(self):
input_seed_file = 'TestSeedRecognition/invalid_seeds.fasta'

with self.assertRaises(SystemExit):
check_inputs.check_seed_type(input_seed_file, self.logger)

def test_protein_seeds(self):
input_seed_file = 'TestSeedRecognition/amino_acid_seeds.fasta'
return_value = check_inputs.check_seed_type(input_seed_file, self.logger)

self.assertEqual(True, return_value)

def test_nucleotide_seeds(self):
input_seed_file = 'TestSeedRecognition/nucleotide_seeds.fasta'
return_value = check_inputs.check_seed_type(input_seed_file, self.logger)

self.assertEqual(return_value, False)


class TestSplittingGff(unittest.TestCase):
def test_gff_split_single_file(self):
''' test the function that splits a gff file into annotations and genome. Assess the number of lines in output '''
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
>prot_fasta_1
ITQPITCCCITMPAIDCITIALATG
>prot_fasta_2
istnetkvmlkcwhnelidgqysti
dgqtkvkcw
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
>invalid_seed_1
Ewiteroingtlwkrmg
>invalid_seed_2
Ewiteroingtlwkrmg
xxxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>prot_fasta_1
ITQPITCCCITMPAIDCITIALATG
>prot_fasta_2
istnetkvmlkcwhnelidgqysti
>nuc_fasta_1
GATCTATGTCATCGATCGATCTCGG
>nuc_fasta_2
actgtgcgctaggatcgagctatcg
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>nuc_fasta_1
GATCTATGTCATCGATCGATCTCGG
>nuc_fasta_2
actgtgcgctaggatcgagctatcg

0 comments on commit 0c4df86

Please sign in to comment.