Skip to content

Commit

Permalink
Add in functions to fix the start site to be a seed. No functional tests
Browse files Browse the repository at this point in the history
  • Loading branch information
milnus committed Nov 8, 2022
1 parent 95b4096 commit 45b8050
Show file tree
Hide file tree
Showing 36 changed files with 559 additions and 705 deletions.
17 changes: 7 additions & 10 deletions Magphi/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,20 +177,17 @@ def main():
cmd_args.protein_seed = protein_seed_check

# construct a temporary folder to hold files
# TODO - Use the standard pythonic way of constructing temporary dicts! - See Corekaburra
file_logger.debug("Construct temporary folder")
tmp_folder = TemporaryDirectory()
# tmp_folder = os.path.join(cmd_args.out_path, "Magphi_tmp_folder")
# try:
# os.mkdir(tmp_folder)
# file_logger.debug("Output folder construction successful")
# except FileExistsError:
# file_logger.warning("A temporary folder already exists at the given output location. "
# "Most likely from an incomplete analysis")

# Read in and combine seeds into pairs
file_logger.debug("Start handling of input seed sequences")
seed_pairs = handle_seeds(cmd_args.seeds, file_logger)
# Find first seed if orienting by seed
if cmd_args.orient_by_seed:
first_seeds = [seed_list[0] for seed_list in seed_pairs.values()]
else:
first_seeds = []

print_seq_out = 'output' if cmd_args.no_seqs else 'None'
if print_seq_out == 'output':
Expand All @@ -203,7 +200,6 @@ def main():
seeds_w_breaks = seed_pairs.copy()
master_inter_seed_dist = {}

# num_genomes = len(genomes)
num_genomes = len(cmd_args.genomes)
# Calculate when to log progress:
progress_num = num_genomes / 10
Expand All @@ -215,7 +211,8 @@ def main():
with concurrent.futures.ThreadPoolExecutor(max_workers=cmd_args.cpu) as executor:
results = [executor.submit(screen_genome_for_seeds, cmd_args.genomes[i], seed_pairs, cmd_args.seeds,
tmp_folder.name, cmd_args.include_seeds, file_type,
cmd_args.out_path, cmd_args.max_seed_dist, file_logger, is_input_gzipped, print_seq_out, cmd_args.protein_seed)
cmd_args.out_path, cmd_args.max_seed_dist, file_logger, is_input_gzipped,
print_seq_out, cmd_args.protein_seed, cmd_args.orient_by_seed, first_seeds)
for i, genome in enumerate(cmd_args.genomes)]

for f in concurrent.futures.as_completed(results):
Expand Down
22 changes: 15 additions & 7 deletions Magphi/commandline_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def get_commandline_arguments(args, version):
metavar='int',
dest='max_seed_dist')

parser.add_argument("-p", "--protein_seed",
help="to use tblastn instead of blastn when protein seeds are supplied - useful for hits across diverse genomes",
required=False,
action="store_true",
default=False)

output_amount = parser.add_mutually_exclusive_group()
output_amount.add_argument('-b',
'--print_breaks',
Expand All @@ -77,13 +83,15 @@ def get_commandline_arguments(args, version):
required=False,
action='store_false',
default=True)

# Add flag for tblastn option (translated protein seed search for nucleotide sequences)
parser.add_argument("-p", "--protein_seed",
help="to use tblastn instead of blastn when protein seeds are supplied - useful for hits across diverse genomes",
required=False,
action="store_true",
default=False)

output_amount.add_argument('-S',
'--stop_orientation',
help='Argument to NOT reorient output sequences and annotations by first seed in pair (Only for connected seed not contig breaks)'
' [default: sequences and annotations are oriented]',
dest='orient_by_seed',
required=False,
action='store_false',
default=True)

# Add the flag for the output folder
parser.add_argument('-o',
Expand Down
183 changes: 158 additions & 25 deletions Magphi/search_insertion_sites.py

Large diffs are not rendered by default.

53 changes: 27 additions & 26 deletions functional_tests/Magphi-test.sh

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions functional_tests/test_data/no_input.expected
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
usage: Magphi [-h] -g .fa/.gff [.fa/.gff ...] -s multi_fasta_file.fa [-is]
[-md int] [-b | -n] [-p] [-o path/to/output] [-c int] [-l | -q]
[--check] [-v]
[-md int] [-p] [-b | -n | -S] [-o path/to/output] [-c int]
[-l | -q] [--check] [-v]

Welcome to Magphi! This program will extract sequences and possible
annotations within a set of seed sequences given as input.
Expand All @@ -21,13 +21,18 @@ optional arguments:
This can often be set a bit higher than an expected
size of a region If no maximum distance is wanted then
set to 0 [default: 50,000bp]
-p, --protein_seed to use tblastn instead of blastn when protein seeds
are supplied - useful for hits across diverse genomes
-b, --print_breaks Argument to print outputs when seeds are next to
contig breaks [default: sequences are not printed]
-n, --no_sequences Argument to not print outputs related to annotations
or sequences found between seeds [default: sequences
are printed]
-p, --protein_seed to use tblastn instead of blastn when protein seeds
are supplied - useful for hits across diverse genomes
-S, --stop_orientation
Argument to NOT reorient output sequences and
annotations by first seed in pair (Only for connected
seed not contig breaks) [default: sequences and
annotations are oriented]
-o path/to/output, --output_folder path/to/output
Give path to output folder [default: current folder]
-c int, --cpu int Give max number of CPUs [default: 1]
Expand Down
189 changes: 157 additions & 32 deletions unit_tests/Magphi_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import json
from shutil import copyfile
import logging
import pybedtools as bedtools

from Magphi import check_inputs
from Magphi import split_gff_file
Expand Down Expand Up @@ -334,43 +335,56 @@ def test_non_matching_seed_names(self):
# seed_pairs = seed_pairs)
# self.assertEqual(result,blast_xml)'''

# TODO - test blast_out_to_sorted_bed function - use an input file of blast xml output - use two sets of seeds
# Test both inclusion and exclution of seeds. - Andrew's responsitibily.
# - We want to test that a blast output is converted correctly to Bed format.
# 1. Produce mock fasta to blast against. (Should have known sites that seeds match. Maybe repeat single Base or gap with seeds being unique)
# 2. Produce mock seeds
# 3. Blast mock seeds against mock fasta using similar settings as Magphi
# 4. Manually curate the positions are as expected
# 5. Manually determine the expected bed file information
# 6. Convert expected bed file format into .json or staight python code to be used for assertion.
# 7. write test.
# 8. Run

class TestBlastOutToSortedBed(unittest.TestCase):

def test_make_bed_list(self):
''' test that the blast xml is correctly being converted to a bed file '''
blast_xml = 'TestBlastOutToSortedBed/Mock_blast_out.xml'
bed_file = 'TestBlastOutToSortedBed/Mock_fasta~~seed.bed'
bed_list=open(bed_file,'r')
genome_name = 'Mock_fasta'
seed_pairs = {'seed': ['seed_1', 'seed_2']}
blast_hit_beds_file, exclusion_list, seed_hits = search_insertion_sites.blast_out_to_sorted_bed(blast_xml_output = blast_xml,
def test_make_bed_list(self):
''' test that the blast xml is correctly being converted to a bed file '''
blast_xml = 'TestBlastOutToSortedBed/Mock_blast_out.xml'
bed_file = 'TestBlastOutToSortedBed/Mock_fasta~~seed.bed'
bed_list=open(bed_file,'r')
genome_name = 'Mock_fasta'
seed_pairs = {'seed': ['seed_1', 'seed_2']}
blast_hit_beds_file, exclusion_list, seed_hits = search_insertion_sites.blast_out_to_sorted_bed(blast_xml_output = blast_xml,
include_seeds = True,
genome_name = genome_name,
seed_pairs = seed_pairs)
blast_hit_beds = open(blast_hit_beds_file[0],'r')
blast_hits_list = blast_hit_beds.readlines()
expected_seed_hits = {'seed': 2}
expected_exclusions = []
expected_blast_hit_beds = bed_list.readlines()
self.assertEqual(expected_seed_hits, seed_hits)
self.assertEqual(expected_exclusions, exclusion_list)
self.assertEqual(expected_blast_hit_beds, blast_hits_list)

# close the files
bed_list.close()
blast_hit_beds.close()
blast_hit_beds = open(blast_hit_beds_file[0], 'r')
blast_hits_list = blast_hit_beds.readlines()
expected_seed_hits = {'seed': 2}
expected_exclusions = []
expected_blast_hit_beds = bed_list.readlines()
self.assertEqual(expected_seed_hits, seed_hits)
self.assertEqual(expected_exclusions, exclusion_list)
self.assertEqual(expected_blast_hit_beds, blast_hits_list)

# close the files
bed_list.close()
blast_hit_beds.close()

def test_strandedness_identification(self):
''' Test the capture of strandedness based on the hsp_hit-frame value in the XML '''
blast_xml = 'TestBlastOutToSortedBed/Mock_blast_out_diff_dir.xml'
bed_file = 'TestBlastOutToSortedBed/Mock_fasta~~seed_diff_dir.bed'
bed_list = open(bed_file, 'r')
genome_name = 'Mock_fasta'
seed_pairs = {'seed': ['seed_1', 'seed_2']}
blast_hit_beds_file, exclusion_list, seed_hits = search_insertion_sites.blast_out_to_sorted_bed(
blast_xml_output=blast_xml,
include_seeds=True,
genome_name=genome_name,
seed_pairs=seed_pairs)
blast_hit_beds = open(blast_hit_beds_file[0], 'r')
blast_hits_list = blast_hit_beds.readlines()
expected_seed_hits = {'seed': 2}
expected_exclusions = []
expected_blast_hit_beds = bed_list.readlines()
self.assertEqual(expected_seed_hits, seed_hits)
self.assertEqual(expected_exclusions, exclusion_list)
self.assertEqual(expected_blast_hit_beds, blast_hits_list)

# close the files
bed_list.close()
blast_hit_beds.close()


class TestSeedsPlacement(unittest.TestCase):
Expand Down Expand Up @@ -1067,6 +1081,117 @@ def test_merge_with_seed_on_contig_edge_exclude_seed(self):
os.remove('TestBedMergeHandling/double_contig~~primer_edge_placement_merged.bed')


class TestReverseComplementDetection(unittest.TestCase):
def test_no_orientation_needed(self):
bed_object = bedtools.BedTool('TestReverseComplementDetection/No_orientation_needed.bed')

seed_to_orient_by = ['seed_1']

return_values = search_insertion_sites.orientation_detector(bed_object, seed_to_orient_by)

self.assertEqual((False, False), return_values)

def test_reverse_sequence(self):
bed_object = bedtools.BedTool('TestReverseComplementDetection/reverse_orientation_needed.bed')

seed_to_orient_by = ['seed_1']

return_values = search_insertion_sites.orientation_detector(bed_object, seed_to_orient_by)

self.assertEqual((False, True), return_values)

def test_reverse_complement_sequence_1(self):
bed_object = bedtools.BedTool('TestReverseComplementDetection/reverse_complement_needed_1.bed')

seed_to_orient_by = ['seed_1']

return_values = search_insertion_sites.orientation_detector(bed_object, seed_to_orient_by)

self.assertEqual((True, True), return_values)

def test_reverse_complement_sequence_2(self):
bed_object = bedtools.BedTool('TestReverseComplementDetection/reverse_complement_needed_2.bed')

seed_to_orient_by = ['seed_1']

return_values = search_insertion_sites.orientation_detector(bed_object, seed_to_orient_by)

self.assertEqual((True, True), return_values)

def test_complement_sequence_1(self):
bed_object = bedtools.BedTool('TestReverseComplementDetection/complement_needed_1.bed')

seed_to_orient_by = ['seed_1']

return_values = search_insertion_sites.orientation_detector(bed_object, seed_to_orient_by)

self.assertEqual((True, False), return_values)

def test_complement_sequence_2(self):
bed_object = bedtools.BedTool('TestReverseComplementDetection/complement_needed_2.bed')

seed_to_orient_by = ['seed_1']

return_values = search_insertion_sites.orientation_detector(bed_object, seed_to_orient_by)

self.assertEqual((True, False), return_values)


class TestMakingOutputOrientationChanges(unittest.TestCase):
def test_fasta_reverse(self):
returned_list = search_insertion_sites.make_output_orientation((False, True), 'TestMakingOutputOrientationChanges/Test_genome.fasta', file_type='fasta')

with open('TestMakingOutputOrientationChanges/Test_genome_reversed.fasta', 'r') as expected_file:
expected_list = expected_file.readlines()

self.assertEqual(expected_list, returned_list)

def test_fasta_complement(self):
returned_list = search_insertion_sites.make_output_orientation((True, False), 'TestMakingOutputOrientationChanges/Test_genome.fasta', file_type='fasta')

with open('TestMakingOutputOrientationChanges/Test_genome_complemented.fasta', 'r') as expected_file:
expected_list = expected_file.readlines()

self.assertEqual(expected_list, returned_list)

def test_fasta_reverse_complement(self):
returned_list = search_insertion_sites.make_output_orientation((True, True), 'TestMakingOutputOrientationChanges/Test_genome.fasta', file_type='fasta')

with open('TestMakingOutputOrientationChanges/Test_genome_reversed_complemented.fasta', 'r') as expected_file:
expected_list = expected_file.readlines()

self.assertEqual(expected_list, returned_list)

def test_gff_reverse(self):
returned_list = search_insertion_sites.make_output_orientation((False, True),
'TestMakingOutputOrientationChanges/Test_genome_in_reversed.gff',
file_type='gff')

with open('TestMakingOutputOrientationChanges/Test_genome_reversed.gff', 'r') as expected_file:
expected_list = expected_file.readlines()

self.assertEqual(expected_list, returned_list)

def test_gff_complement(self):
returned_list = search_insertion_sites.make_output_orientation((True, False),
'TestMakingOutputOrientationChanges/Test_genome_in_complemented.gff',
file_type='gff')

with open('TestMakingOutputOrientationChanges/Test_genome_complemented.gff', 'r') as expected_file:
expected_list = expected_file.readlines()

self.assertEqual(expected_list, returned_list)

def test_gff_reverse_complement(self):
returned_list = search_insertion_sites.make_output_orientation((True, True),
'TestMakingOutputOrientationChanges/Test_genome_in_reverse_complemented.gff',
file_type='gff')

with open('TestMakingOutputOrientationChanges/Test_genome_reverse_complemented.gff', 'r') as expected_file:
expected_list = expected_file.readlines()

self.assertEqual(expected_list, returned_list)

class TestExtractSeqsNAnnots(unittest.TestCase):
def test_same_contig_fasta_extraction(self):
''' Test extraction of fasta sequence given a fasta file with region of interest on single same contig. '''
Expand Down
2 changes: 2 additions & 0 deletions unit_tests/unit_test_data/Mock_fasta~~seed.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Mock_fasta 0 3495 seed_1 . +
Mock_fasta 21084 22674 seed_2 . -

This file was deleted.

Loading

0 comments on commit 45b8050

Please sign in to comment.