From 4279b66fe17030c789585b1e4834d37bb2a3a84c Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 24 May 2022 11:40:34 -0700 Subject: [PATCH] Move VCF functions to io, add io_support for shell_command_runner These VCF functions are specifically IO-related utilities, so moving them to a better scope. --- augur/align.py | 3 +- augur/filter.py | 4 +- augur/index.py | 3 +- augur/io.py | 168 ++++++++++++++++++ .../shell_command_runner.py | 0 augur/mask.py | 4 +- augur/translate.py | 3 +- augur/tree.py | 4 +- augur/utils.py | 158 ---------------- .../test_shell_command_runner.py | 4 +- tests/test_io.py | 73 ++++++++ tests/test_utils.py | 71 -------- 12 files changed, 254 insertions(+), 241 deletions(-) rename augur/{util_support => io_support}/shell_command_runner.py (100%) rename tests/{util_support => io_support}/test_shell_command_runner.py (94%) diff --git a/augur/align.py b/augur/align.py index d8d24cb3a..9a27c92f4 100644 --- a/augur/align.py +++ b/augur/align.py @@ -6,7 +6,8 @@ from shutil import copyfile import numpy as np from Bio import AlignIO, SeqIO, Seq, Align -from .utils import run_shell_command, nthreads_value, shquote +from .io import run_shell_command, shquote +from .utils import nthreads_value from collections import defaultdict class AlignmentError(Exception): diff --git a/augur/filter.py b/augur/filter.py index eac70979a..4941f6695 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -19,8 +19,8 @@ from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT, is_date_ambiguous, get_numerical_dates from .index import index_sequences, index_vcf -from .io import open_file, read_metadata, read_sequences, write_sequences -from .utils import AugurError, is_vcf as filename_is_vcf, write_vcf, read_strains +from .io import open_file, read_metadata, read_sequences, write_sequences, is_vcf as filename_is_vcf, write_vcf +from .utils import AugurError, read_strains comment_char = '#' diff --git a/augur/index.py b/augur/index.py index 7db596a42..7abb53160 100644 --- a/augur/index.py +++ b/augur/index.py @@ -8,8 +8,7 @@ import sys import csv -from .io import open_file, read_sequences -from .utils import is_vcf, read_vcf +from .io import open_file, read_sequences, is_vcf, read_vcf def register_arguments(parser): diff --git a/augur/io.py b/augur/io.py index 5abc8151c..dd6d42469 100644 --- a/augur/io.py +++ b/augur/io.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 """Interfaces for reading and writing data also known as input/output (I/O) """ +import os +import shlex import Bio.SeqIO import Bio.SeqRecord import sys @@ -9,6 +11,11 @@ from pathlib import Path from xopen import xopen +from augur.io_support.shell_command_runner import ShellCommandRunner + + +shquote = shlex.quote + @contextmanager def open_file(path_or_buffer, mode="r", **kwargs): @@ -199,3 +206,164 @@ def write_sequences(sequences, path_or_buffer, format="fasta"): def print_err(*args): print(*args, file=sys.stderr) + + +def is_vcf(filename): + """Convenience method to check if a file is a vcf file. + + >>> is_vcf(None) + False + >>> is_vcf("./foo") + False + >>> is_vcf("./foo.vcf") + True + >>> is_vcf("./foo.vcf.GZ") + True + """ + return bool(filename) and any(filename.lower().endswith(x) for x in ('.vcf', '.vcf.gz')) + + +def read_vcf(filename): + if filename.lower().endswith(".gz"): + import gzip + file = gzip.open(filename, mode="rt", encoding='utf-8') + else: + file = open(filename, encoding='utf-8') + + chrom_line = next(line for line in file if line.startswith("#C")) + file.close() + headers = chrom_line.strip().split("\t") + sequences = headers[headers.index("FORMAT") + 1:] + + # because we need 'seqs to remove' for VCF + return sequences, sequences.copy() + + +def write_vcf(input_filename, output_filename, dropped_samps): + if _filename_gz(input_filename): + input_arg = "--gzvcf" + else: + input_arg = "--vcf" + + if _filename_gz(output_filename): + output_pipe = "| gzip -c" + else: + output_pipe = "" + + drop_args = ["--remove-indv " + shquote(s) for s in dropped_samps] + + call = ["vcftools"] + drop_args + [input_arg, shquote(input_filename), "--recode --stdout", output_pipe, ">", shquote(output_filename)] + + print("Filtering samples using VCFTools with the call:") + print(" ".join(call)) + run_shell_command(" ".join(call), raise_errors = True) + # remove vcftools log file + try: + os.remove('out.log') + except OSError: + pass + + +def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name): + """ + Writes out a VCF-style file (which seems to be minimally handleable + by vcftools and pyvcf) of the AA differences between sequences and the reference. + This is a similar format created/used by read_in_vcf except that there is one + of these dicts (with sequences, reference, positions) for EACH gene. + + Also writes out a fasta of the reference alignment. + + EBH 12 Dec 2017 + """ + import numpy as np + + #for the header + seqNames = list(prot_dict[list(prot_dict.keys())[0]]['sequences'].keys()) + + #prepare the header of the VCF & write out + header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+seqNames + with open(vcf_file_name, 'w', encoding='utf-8') as the_file: + the_file.write( "##fileformat=VCFv4.2\n"+ + "##source=NextStrain_Protein_Translation\n"+ + "##FORMAT=\n") + the_file.write("\t".join(header)+"\n") + + refWrite = [] + vcfWrite = [] + + #go through for every gene/protein + for fname, prot in prot_dict.items(): + sequences = prot['sequences'] + ref = prot['reference'] + positions = prot['positions'] + + #write out the reference fasta + refWrite.append(">"+fname) + refWrite.append(ref) + + #go through every variable position + #There are no deletions here, so it's simpler than for VCF nuc sequenes! + for pi in positions: + pos = pi+1 #change numbering to match VCF not python + refb = ref[pi] #reference base at this position + + #try/except is (much) faster than list comprehension! + pattern = [] + for k,v in sequences.items(): + try: + pattern.append(sequences[k][pi]) + except KeyError: + pattern.append('.') + pattern = np.array(pattern) + + #get the list of ALTs - minus any '.'! + uniques = np.unique(pattern) + uniques = uniques[np.where(uniques!='.')] + + #Convert bases to the number that matches the ALT + j=1 + for u in uniques: + pattern[np.where(pattern==u)[0]] = str(j) + j+=1 + #Now convert these calls to #/# (VCF format) + calls = [ j+"/"+j if j!='.' else '.' for j in pattern ] + if len(uniques)==0: + print("UNEXPECTED ERROR WHILE CONVERTING TO VCF AT POSITION {}".format(str(pi))) + break + + #put it all together and write it out + output = [fname, str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls + + vcfWrite.append("\t".join(output)) + + #write it all out + with open(ref_file_name, 'w', encoding='utf-8') as the_file: + the_file.write("\n".join(refWrite)) + + with open(vcf_file_name, 'a', encoding='utf-8') as the_file: + the_file.write("\n".join(vcfWrite)) + + if vcf_file_name.lower().endswith('.gz'): + import os + #must temporarily remove .gz ending, or gzip won't zip it! + os.rename(vcf_file_name, vcf_file_name[:-3]) + call = ["gzip", vcf_file_name[:-3]] + run_shell_command(" ".join(call), raise_errors = True) + + +def run_shell_command(cmd, raise_errors=False, extra_env=None): + """ + Run the given command string via Bash with error checking. + + Returns True if the command exits normally. Returns False if the command + exits with failure and "raise_errors" is False (the default). When + "raise_errors" is True, exceptions are rethrown. + + If an *extra_env* mapping is passed, the provided keys and values are + overlayed onto the default subprocess environment. + """ + return ShellCommandRunner(cmd, raise_errors=raise_errors, extra_env=extra_env).run() + + +def _filename_gz(filename): + return filename.lower().endswith(".gz") diff --git a/augur/util_support/shell_command_runner.py b/augur/io_support/shell_command_runner.py similarity index 100% rename from augur/util_support/shell_command_runner.py rename to augur/io_support/shell_command_runner.py diff --git a/augur/mask.py b/augur/mask.py index b52232d7d..afa02846f 100644 --- a/augur/mask.py +++ b/augur/mask.py @@ -10,8 +10,8 @@ from Bio import SeqIO from Bio.Seq import MutableSeq -from .io import open_file, read_sequences, write_sequences -from .utils import run_shell_command, shquote, is_vcf, load_mask_sites, VALID_NUCLEOTIDES +from .io import open_file, read_sequences, write_sequences, run_shell_command, shquote, is_vcf +from .utils import load_mask_sites, VALID_NUCLEOTIDES def get_chrom_name(vcf_file): """Read the CHROM field from the first non-header line of a vcf file. diff --git a/augur/translate.py b/augur/translate.py index 244b86c12..d2f1293b6 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -5,7 +5,8 @@ import os, sys import numpy as np from Bio import SeqIO, SeqFeature, Seq, SeqRecord, Phylo -from .utils import read_node_data, load_features, write_json, write_VCF_translation, get_json_name +from .io import write_VCF_translation +from .utils import read_node_data, load_features, write_json, get_json_name from treetime.vcf_utils import read_vcf class MissingNodeError(Exception): diff --git a/augur/tree.py b/augur/tree.py index 8b66f0eee..fdd0d48bd 100644 --- a/augur/tree.py +++ b/augur/tree.py @@ -15,8 +15,8 @@ from treetime.vcf_utils import read_vcf from pathlib import Path -from .io import read_sequences -from .utils import run_shell_command, nthreads_value, shquote, load_mask_sites +from .io import read_sequences, run_shell_command, shquote +from .utils import nthreads_value, load_mask_sites DEFAULT_ARGS = { "fasttree": "-nt -nosupport", diff --git a/augur/utils.py b/augur/utils.py index f0ac6d7f1..2d0695ef8 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -17,68 +17,12 @@ from augur.util_support.color_parser import ColorParser from augur.util_support.metadata_file import MetadataFile from augur.util_support.node_data_reader import NodeDataReader -from augur.util_support.shell_command_runner import ShellCommandRunner class AugurError(Exception): pass -def is_vcf(filename): - """Convenience method to check if a file is a vcf file. - - >>> is_vcf(None) - False - >>> is_vcf("./foo") - False - >>> is_vcf("./foo.vcf") - True - >>> is_vcf("./foo.vcf.GZ") - True - """ - return bool(filename) and any(filename.lower().endswith(x) for x in ('.vcf', '.vcf.gz')) - -def read_vcf(filename): - if filename.lower().endswith(".gz"): - import gzip - file = gzip.open(filename, mode="rt", encoding='utf-8') - else: - file = open(filename, encoding='utf-8') - - chrom_line = next(line for line in file if line.startswith("#C")) - file.close() - headers = chrom_line.strip().split("\t") - sequences = headers[headers.index("FORMAT") + 1:] - - # because we need 'seqs to remove' for VCF - return sequences, sequences.copy() - -def write_vcf(input_filename, output_filename, dropped_samps): - if _filename_gz(input_filename): - input_arg = "--gzvcf" - else: - input_arg = "--vcf" - - if _filename_gz(output_filename): - output_pipe = "| gzip -c" - else: - output_pipe = "" - - drop_args = ["--remove-indv " + shquote(s) for s in dropped_samps] - - call = ["vcftools"] + drop_args + [input_arg, shquote(input_filename), "--recode --stdout", output_pipe, ">", shquote(output_filename)] - - print("Filtering samples using VCFTools with the call:") - print(" ".join(call)) - run_shell_command(" ".join(call), raise_errors = True) - # remove vcftools log file - try: - os.remove('out.log') - except OSError: - pass - -def _filename_gz(filename): - return filename.lower().endswith(".gz") def get_json_name(args, default=None): if args.output_node_data: @@ -302,108 +246,6 @@ def add_line_to_coordinates(line): def read_colors(overrides=None, use_defaults=True): return ColorParser(mapping_filename=overrides, use_defaults=use_defaults).mapping -def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name): - """ - Writes out a VCF-style file (which seems to be minimally handleable - by vcftools and pyvcf) of the AA differences between sequences and the reference. - This is a similar format created/used by read_in_vcf except that there is one - of these dicts (with sequences, reference, positions) for EACH gene. - - Also writes out a fasta of the reference alignment. - - EBH 12 Dec 2017 - """ - import numpy as np - - #for the header - seqNames = list(prot_dict[list(prot_dict.keys())[0]]['sequences'].keys()) - - #prepare the header of the VCF & write out - header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+seqNames - with open(vcf_file_name, 'w', encoding='utf-8') as the_file: - the_file.write( "##fileformat=VCFv4.2\n"+ - "##source=NextStrain_Protein_Translation\n"+ - "##FORMAT=\n") - the_file.write("\t".join(header)+"\n") - - refWrite = [] - vcfWrite = [] - - #go through for every gene/protein - for fname, prot in prot_dict.items(): - sequences = prot['sequences'] - ref = prot['reference'] - positions = prot['positions'] - - #write out the reference fasta - refWrite.append(">"+fname) - refWrite.append(ref) - - #go through every variable position - #There are no deletions here, so it's simpler than for VCF nuc sequenes! - for pi in positions: - pos = pi+1 #change numbering to match VCF not python - refb = ref[pi] #reference base at this position - - #try/except is (much) faster than list comprehension! - pattern = [] - for k,v in sequences.items(): - try: - pattern.append(sequences[k][pi]) - except KeyError: - pattern.append('.') - pattern = np.array(pattern) - - #get the list of ALTs - minus any '.'! - uniques = np.unique(pattern) - uniques = uniques[np.where(uniques!='.')] - - #Convert bases to the number that matches the ALT - j=1 - for u in uniques: - pattern[np.where(pattern==u)[0]] = str(j) - j+=1 - #Now convert these calls to #/# (VCF format) - calls = [ j+"/"+j if j!='.' else '.' for j in pattern ] - if len(uniques)==0: - print("UNEXPECTED ERROR WHILE CONVERTING TO VCF AT POSITION {}".format(str(pi))) - break - - #put it all together and write it out - output = [fname, str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls - - vcfWrite.append("\t".join(output)) - - #write it all out - with open(ref_file_name, 'w', encoding='utf-8') as the_file: - the_file.write("\n".join(refWrite)) - - with open(vcf_file_name, 'a', encoding='utf-8') as the_file: - the_file.write("\n".join(vcfWrite)) - - if vcf_file_name.lower().endswith('.gz'): - import os - #must temporarily remove .gz ending, or gzip won't zip it! - os.rename(vcf_file_name, vcf_file_name[:-3]) - call = ["gzip", vcf_file_name[:-3]] - run_shell_command(" ".join(call), raise_errors = True) - -shquote = shlex.quote - -def run_shell_command(cmd, raise_errors=False, extra_env=None): - """ - Run the given command string via Bash with error checking. - - Returns True if the command exits normally. Returns False if the command - exits with failure and "raise_errors" is False (the default). When - "raise_errors" is True, exceptions are rethrown. - - If an *extra_env* mapping is passed, the provided keys and values are - overlayed onto the default subprocess environment. - """ - return ShellCommandRunner(cmd, raise_errors=raise_errors, extra_env=extra_env).run() - - def first_line(text): """ Returns the first line of the given text, ignoring leading and trailing diff --git a/tests/util_support/test_shell_command_runner.py b/tests/io_support/test_shell_command_runner.py similarity index 94% rename from tests/util_support/test_shell_command_runner.py rename to tests/io_support/test_shell_command_runner.py index 7b1ea5aa9..ae7c41b02 100644 --- a/tests/util_support/test_shell_command_runner.py +++ b/tests/io_support/test_shell_command_runner.py @@ -2,7 +2,7 @@ import re import subprocess -from augur.util_support.shell_command_runner import ShellCommandRunner +from augur.io_support.shell_command_runner import ShellCommandRunner import pytest @@ -69,7 +69,7 @@ def test_modified_env(self): ) def test_print_error_message(self, mocker, exception, expected_message): mock_print_error = mocker.patch( - "augur.util_support.shell_command_runner.ShellCommandRunner.print_error" + "augur.io_support.shell_command_runner.ShellCommandRunner.print_error" ) ShellCommandRunner("cmd").print_error_message(exception) diff --git a/tests/test_io.py b/tests/test_io.py index 2da0a1bff..c79fa290e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -10,6 +10,7 @@ import random import sys +import augur.io as io from augur.io import open_file, read_sequences, write_sequences @@ -77,6 +78,10 @@ def lzma_fasta_filename(tmpdir, sequences): def genbank_reference(): return "tests/builds/zika/config/zika_outgroup.gb" +@pytest.fixture +def mock_run_shell_command(mocker): + mocker.patch("augur.io.run_shell_command") + class TestReadSequences: def test_read_sequences_from_single_file(self, fasta_filename): @@ -231,3 +236,71 @@ def test_open_file_read_lzma(self, tmpdir): f_write.write('foo\nbar\n') with lzma.open(path, 'rt') as f_read: assert f_read.read() == 'foo\nbar\n' + + +class TestVCF: + def test_read_vcf_compressed(self): + seq_keep, all_seq = io.read_vcf( + "tests/builds/tb/data/lee_2015.vcf.gz" + ) + + assert len(seq_keep) == 150 + assert seq_keep[149] == "G22733" + assert seq_keep == all_seq + + def test_read_vcf_uncompressed(self): + seq_keep, all_seq = io.read_vcf("tests/builds/tb/data/lee_2015.vcf") + + assert len(seq_keep) == 150 + assert seq_keep[149] == "G22733" + assert seq_keep == all_seq + + def test_write_vcf_compressed_input(self, mock_run_shell_command): + io.write_vcf( + "tests/builds/tb/data/lee_2015.vcf.gz", "output_file.vcf.gz", [] + ) + + io.run_shell_command.assert_called_once_with( + "vcftools --gzvcf tests/builds/tb/data/lee_2015.vcf.gz --recode --stdout | gzip -c > output_file.vcf.gz", + raise_errors=True, + ) + + def test_write_vcf_uncompressed_input(self, mock_run_shell_command): + io.write_vcf( + "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf.gz", [] + ) + + io.run_shell_command.assert_called_once_with( + "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout | gzip -c > output_file.vcf.gz", + raise_errors=True, + ) + + def test_write_vcf_compressed_output(self, mock_run_shell_command): + io.write_vcf( + "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf.gz", [] + ) + + io.run_shell_command.assert_called_once_with( + "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout | gzip -c > output_file.vcf.gz", + raise_errors=True, + ) + + def test_write_vcf_uncompressed_output(self, mock_run_shell_command): + io.write_vcf( + "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf", [] + ) + + io.run_shell_command.assert_called_once_with( + "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout > output_file.vcf", + raise_errors=True, + ) + + def test_write_vcf_dropped_samples(self, mock_run_shell_command): + io.write_vcf( + "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf", ["x", "y", "z"] + ) + + io.run_shell_command.assert_called_once_with( + "vcftools --remove-indv x --remove-indv y --remove-indv z --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout > output_file.vcf", + raise_errors=True, + ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 90946df78..05ecd07fb 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,11 +7,6 @@ from test_filter import write_metadata -@pytest.fixture -def mock_run_shell_command(mocker): - mocker.patch("augur.utils.run_shell_command") - - class TestUtils: @pytest.mark.parametrize("extension", ["bed","BED"]) @patch('augur.utils.read_bed_file') @@ -111,69 +106,3 @@ def test_read_metadata(self, tmpdir): with pytest.raises(ValueError) as e_info: utils.read_metadata(meta_fn, as_data_frame=True) assert str(e_info.value) == "Duplicated strain in metadata: SEQ_1" - - def test_read_vcf_compressed(self): - seq_keep, all_seq = utils.read_vcf( - "tests/builds/tb/data/lee_2015.vcf.gz" - ) - - assert len(seq_keep) == 150 - assert seq_keep[149] == "G22733" - assert seq_keep == all_seq - - def test_read_vcf_uncompressed(self): - seq_keep, all_seq = utils.read_vcf("tests/builds/tb/data/lee_2015.vcf") - - assert len(seq_keep) == 150 - assert seq_keep[149] == "G22733" - assert seq_keep == all_seq - - def test_write_vcf_compressed_input(self, mock_run_shell_command): - utils.write_vcf( - "tests/builds/tb/data/lee_2015.vcf.gz", "output_file.vcf.gz", [] - ) - - utils.run_shell_command.assert_called_once_with( - "vcftools --gzvcf tests/builds/tb/data/lee_2015.vcf.gz --recode --stdout | gzip -c > output_file.vcf.gz", - raise_errors=True, - ) - - def test_write_vcf_uncompressed_input(self, mock_run_shell_command): - utils.write_vcf( - "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf.gz", [] - ) - - utils.run_shell_command.assert_called_once_with( - "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout | gzip -c > output_file.vcf.gz", - raise_errors=True, - ) - - def test_write_vcf_compressed_output(self, mock_run_shell_command): - utils.write_vcf( - "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf.gz", [] - ) - - utils.run_shell_command.assert_called_once_with( - "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout | gzip -c > output_file.vcf.gz", - raise_errors=True, - ) - - def test_write_vcf_uncompressed_output(self, mock_run_shell_command): - utils.write_vcf( - "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf", [] - ) - - utils.run_shell_command.assert_called_once_with( - "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout > output_file.vcf", - raise_errors=True, - ) - - def test_write_vcf_dropped_samples(self, mock_run_shell_command): - utils.write_vcf( - "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf", ["x", "y", "z"] - ) - - utils.run_shell_command.assert_called_once_with( - "vcftools --remove-indv x --remove-indv y --remove-indv z --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout > output_file.vcf", - raise_errors=True, - )