From 4279b66fe17030c789585b1e4834d37bb2a3a84c Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Tue, 24 May 2022 11:40:34 -0700
Subject: [PATCH] Move VCF functions to io, add io_support for
 shell_command_runner

These VCF functions are specifically IO-related utilities, so moving them to a better scope.
---
 augur/align.py                                |   3 +-
 augur/filter.py                               |   4 +-
 augur/index.py                                |   3 +-
 augur/io.py                                   | 168 ++++++++++++++++++
 .../shell_command_runner.py                   |   0
 augur/mask.py                                 |   4 +-
 augur/translate.py                            |   3 +-
 augur/tree.py                                 |   4 +-
 augur/utils.py                                | 158 ----------------
 .../test_shell_command_runner.py              |   4 +-
 tests/test_io.py                              |  73 ++++++++
 tests/test_utils.py                           |  71 --------
 12 files changed, 254 insertions(+), 241 deletions(-)
 rename augur/{util_support => io_support}/shell_command_runner.py (100%)
 rename tests/{util_support => io_support}/test_shell_command_runner.py (94%)

diff --git a/augur/align.py b/augur/align.py
index d8d24cb3a..9a27c92f4 100644
--- a/augur/align.py
+++ b/augur/align.py
@@ -6,7 +6,8 @@
 from shutil import copyfile
 import numpy as np
 from Bio import AlignIO, SeqIO, Seq, Align
-from .utils import run_shell_command, nthreads_value, shquote
+from .io import run_shell_command, shquote
+from .utils import nthreads_value
 from collections import defaultdict
 
 class AlignmentError(Exception):
diff --git a/augur/filter.py b/augur/filter.py
index eac70979a..4941f6695 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -19,8 +19,8 @@
 
 from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT, is_date_ambiguous, get_numerical_dates
 from .index import index_sequences, index_vcf
-from .io import open_file, read_metadata, read_sequences, write_sequences
-from .utils import AugurError, is_vcf as filename_is_vcf, write_vcf, read_strains
+from .io import open_file, read_metadata, read_sequences, write_sequences, is_vcf as filename_is_vcf, write_vcf
+from .utils import AugurError, read_strains
 
 comment_char = '#'
 
diff --git a/augur/index.py b/augur/index.py
index 7db596a42..7abb53160 100644
--- a/augur/index.py
+++ b/augur/index.py
@@ -8,8 +8,7 @@
 import sys
 import csv
 
-from .io import open_file, read_sequences
-from .utils import is_vcf, read_vcf
+from .io import open_file, read_sequences, is_vcf, read_vcf
 
 
 def register_arguments(parser):
diff --git a/augur/io.py b/augur/io.py
index 5abc8151c..dd6d42469 100644
--- a/augur/io.py
+++ b/augur/io.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 """Interfaces for reading and writing data also known as input/output (I/O)
 """
+import os
+import shlex
 import Bio.SeqIO
 import Bio.SeqRecord
 import sys
@@ -9,6 +11,11 @@
 from pathlib import Path
 from xopen import xopen
 
+from augur.io_support.shell_command_runner import ShellCommandRunner
+
+
+shquote = shlex.quote
+
 
 @contextmanager
 def open_file(path_or_buffer, mode="r", **kwargs):
@@ -199,3 +206,164 @@ def write_sequences(sequences, path_or_buffer, format="fasta"):
 
 def print_err(*args):
     print(*args, file=sys.stderr)
+
+
+def is_vcf(filename):
+    """Convenience method to check if a file is a vcf file.
+
+    >>> is_vcf(None)
+    False
+    >>> is_vcf("./foo")
+    False
+    >>> is_vcf("./foo.vcf")
+    True
+    >>> is_vcf("./foo.vcf.GZ")
+    True
+    """
+    return bool(filename) and any(filename.lower().endswith(x) for x in ('.vcf', '.vcf.gz'))
+
+
+def read_vcf(filename):
+    if filename.lower().endswith(".gz"):
+        import gzip
+        file = gzip.open(filename, mode="rt", encoding='utf-8')
+    else:
+        file = open(filename, encoding='utf-8')
+
+    chrom_line = next(line for line in file if line.startswith("#C"))
+    file.close()
+    headers = chrom_line.strip().split("\t")
+    sequences = headers[headers.index("FORMAT") + 1:]
+
+    # because we need 'seqs to remove' for VCF
+    return sequences, sequences.copy()
+
+
+def write_vcf(input_filename, output_filename, dropped_samps):
+    if _filename_gz(input_filename):
+        input_arg = "--gzvcf"
+    else:
+        input_arg = "--vcf"
+
+    if _filename_gz(output_filename):
+        output_pipe = "| gzip -c"
+    else:
+        output_pipe = ""
+
+    drop_args = ["--remove-indv " + shquote(s) for s in dropped_samps]
+
+    call = ["vcftools"] + drop_args + [input_arg, shquote(input_filename), "--recode --stdout", output_pipe, ">", shquote(output_filename)]
+
+    print("Filtering samples using VCFTools with the call:")
+    print(" ".join(call))
+    run_shell_command(" ".join(call), raise_errors = True)
+    # remove vcftools log file
+    try:
+        os.remove('out.log')
+    except OSError:
+        pass
+
+
+def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
+    """
+    Writes out a VCF-style file (which seems to be minimally handleable
+    by vcftools and pyvcf) of the AA differences between sequences and the reference.
+    This is a similar format created/used by read_in_vcf except that there is one
+    of these dicts (with sequences, reference, positions) for EACH gene.
+
+    Also writes out a fasta of the reference alignment.
+
+    EBH 12 Dec 2017
+    """
+    import numpy as np
+
+    #for the header
+    seqNames = list(prot_dict[list(prot_dict.keys())[0]]['sequences'].keys())
+
+    #prepare the header of the VCF & write out
+    header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+seqNames
+    with open(vcf_file_name, 'w', encoding='utf-8') as the_file:
+        the_file.write( "##fileformat=VCFv4.2\n"+
+                        "##source=NextStrain_Protein_Translation\n"+
+                        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
+        the_file.write("\t".join(header)+"\n")
+
+    refWrite = []
+    vcfWrite = []
+
+    #go through for every gene/protein
+    for fname, prot in prot_dict.items():
+        sequences = prot['sequences']
+        ref = prot['reference']
+        positions = prot['positions']
+
+        #write out the reference fasta
+        refWrite.append(">"+fname)
+        refWrite.append(ref)
+
+        #go through every variable position
+        #There are no deletions here, so it's simpler than for VCF nuc sequenes!
+        for pi in positions:
+            pos = pi+1 #change numbering to match VCF not python
+            refb = ref[pi] #reference base at this position
+
+            #try/except is (much) faster than list comprehension!
+            pattern = []
+            for k,v in sequences.items():
+                try:
+                    pattern.append(sequences[k][pi])
+                except KeyError:
+                    pattern.append('.')
+            pattern = np.array(pattern)
+
+            #get the list of ALTs - minus any '.'!
+            uniques = np.unique(pattern)
+            uniques = uniques[np.where(uniques!='.')]
+
+            #Convert bases to the number that matches the ALT
+            j=1
+            for u in uniques:
+                pattern[np.where(pattern==u)[0]] = str(j)
+                j+=1
+            #Now convert these calls to #/# (VCF format)
+            calls = [ j+"/"+j if j!='.' else '.' for j in pattern ]
+            if len(uniques)==0:
+                print("UNEXPECTED ERROR WHILE CONVERTING TO VCF AT POSITION {}".format(str(pi)))
+                break
+
+            #put it all together and write it out
+            output = [fname, str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls
+
+            vcfWrite.append("\t".join(output))
+
+    #write it all out
+    with open(ref_file_name, 'w', encoding='utf-8') as the_file:
+        the_file.write("\n".join(refWrite))
+
+    with open(vcf_file_name, 'a', encoding='utf-8') as the_file:
+        the_file.write("\n".join(vcfWrite))
+
+    if vcf_file_name.lower().endswith('.gz'):
+        import os
+        #must temporarily remove .gz ending, or gzip won't zip it!
+        os.rename(vcf_file_name, vcf_file_name[:-3])
+        call = ["gzip", vcf_file_name[:-3]]
+        run_shell_command(" ".join(call), raise_errors = True)
+
+
+def run_shell_command(cmd, raise_errors=False, extra_env=None):
+    """
+    Run the given command string via Bash with error checking.
+
+    Returns True if the command exits normally.  Returns False if the command
+    exits with failure and "raise_errors" is False (the default).  When
+    "raise_errors" is True, exceptions are rethrown.
+
+    If an *extra_env* mapping is passed, the provided keys and values are
+    overlayed onto the default subprocess environment.
+    """
+    return ShellCommandRunner(cmd, raise_errors=raise_errors, extra_env=extra_env).run()
+
+
+def _filename_gz(filename):
+    return filename.lower().endswith(".gz")
diff --git a/augur/util_support/shell_command_runner.py b/augur/io_support/shell_command_runner.py
similarity index 100%
rename from augur/util_support/shell_command_runner.py
rename to augur/io_support/shell_command_runner.py
diff --git a/augur/mask.py b/augur/mask.py
index b52232d7d..afa02846f 100644
--- a/augur/mask.py
+++ b/augur/mask.py
@@ -10,8 +10,8 @@
 from Bio import SeqIO
 from Bio.Seq import MutableSeq
 
-from .io import open_file, read_sequences, write_sequences
-from .utils import run_shell_command, shquote, is_vcf, load_mask_sites, VALID_NUCLEOTIDES
+from .io import open_file, read_sequences, write_sequences, run_shell_command, shquote, is_vcf
+from .utils import load_mask_sites, VALID_NUCLEOTIDES
 
 def get_chrom_name(vcf_file):
     """Read the CHROM field from the first non-header line of a vcf file.
diff --git a/augur/translate.py b/augur/translate.py
index 244b86c12..d2f1293b6 100644
--- a/augur/translate.py
+++ b/augur/translate.py
@@ -5,7 +5,8 @@
 import os, sys
 import numpy as np
 from Bio import SeqIO, SeqFeature, Seq, SeqRecord, Phylo
-from .utils import read_node_data, load_features, write_json, write_VCF_translation, get_json_name
+from .io import write_VCF_translation
+from .utils import read_node_data, load_features, write_json, get_json_name
 from treetime.vcf_utils import read_vcf
 
 class MissingNodeError(Exception):
diff --git a/augur/tree.py b/augur/tree.py
index 8b66f0eee..fdd0d48bd 100644
--- a/augur/tree.py
+++ b/augur/tree.py
@@ -15,8 +15,8 @@
 from treetime.vcf_utils import read_vcf
 from pathlib import Path
 
-from .io import read_sequences
-from .utils import run_shell_command, nthreads_value, shquote, load_mask_sites
+from .io import read_sequences, run_shell_command, shquote
+from .utils import nthreads_value, load_mask_sites
 
 DEFAULT_ARGS = {
     "fasttree": "-nt -nosupport",
diff --git a/augur/utils.py b/augur/utils.py
index f0ac6d7f1..2d0695ef8 100644
--- a/augur/utils.py
+++ b/augur/utils.py
@@ -17,68 +17,12 @@
 from augur.util_support.color_parser import ColorParser
 from augur.util_support.metadata_file import MetadataFile
 from augur.util_support.node_data_reader import NodeDataReader
-from augur.util_support.shell_command_runner import ShellCommandRunner
 
 
 class AugurError(Exception):
     pass
 
 
-def is_vcf(filename):
-    """Convenience method to check if a file is a vcf file.
-
-    >>> is_vcf(None)
-    False
-    >>> is_vcf("./foo")
-    False
-    >>> is_vcf("./foo.vcf")
-    True
-    >>> is_vcf("./foo.vcf.GZ")
-    True
-    """
-    return bool(filename) and any(filename.lower().endswith(x) for x in ('.vcf', '.vcf.gz'))
-
-def read_vcf(filename):
-    if filename.lower().endswith(".gz"):
-        import gzip
-        file = gzip.open(filename, mode="rt", encoding='utf-8')
-    else:
-        file = open(filename, encoding='utf-8')
-
-    chrom_line = next(line for line in file if line.startswith("#C"))
-    file.close()
-    headers = chrom_line.strip().split("\t")
-    sequences = headers[headers.index("FORMAT") + 1:]
-
-    # because we need 'seqs to remove' for VCF
-    return sequences, sequences.copy()
-
-def write_vcf(input_filename, output_filename, dropped_samps):
-    if _filename_gz(input_filename):
-        input_arg = "--gzvcf"
-    else:
-        input_arg = "--vcf"
-
-    if _filename_gz(output_filename):
-        output_pipe = "| gzip -c"
-    else:
-        output_pipe = ""
-
-    drop_args = ["--remove-indv " + shquote(s) for s in dropped_samps]
-
-    call = ["vcftools"] + drop_args + [input_arg, shquote(input_filename), "--recode --stdout", output_pipe, ">", shquote(output_filename)]
-
-    print("Filtering samples using VCFTools with the call:")
-    print(" ".join(call))
-    run_shell_command(" ".join(call), raise_errors = True)
-    # remove vcftools log file
-    try:
-        os.remove('out.log')
-    except OSError:
-        pass
-
-def _filename_gz(filename):
-    return filename.lower().endswith(".gz")
 
 def get_json_name(args, default=None):
     if args.output_node_data:
@@ -302,108 +246,6 @@ def add_line_to_coordinates(line):
 def read_colors(overrides=None, use_defaults=True):
     return ColorParser(mapping_filename=overrides, use_defaults=use_defaults).mapping
 
-def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
-    """
-    Writes out a VCF-style file (which seems to be minimally handleable
-    by vcftools and pyvcf) of the AA differences between sequences and the reference.
-    This is a similar format created/used by read_in_vcf except that there is one
-    of these dicts (with sequences, reference, positions) for EACH gene.
-
-    Also writes out a fasta of the reference alignment.
-
-    EBH 12 Dec 2017
-    """
-    import numpy as np
-
-    #for the header
-    seqNames = list(prot_dict[list(prot_dict.keys())[0]]['sequences'].keys())
-
-    #prepare the header of the VCF & write out
-    header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+seqNames
-    with open(vcf_file_name, 'w', encoding='utf-8') as the_file:
-        the_file.write( "##fileformat=VCFv4.2\n"+
-                        "##source=NextStrain_Protein_Translation\n"+
-                        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
-        the_file.write("\t".join(header)+"\n")
-
-    refWrite = []
-    vcfWrite = []
-
-    #go through for every gene/protein
-    for fname, prot in prot_dict.items():
-        sequences = prot['sequences']
-        ref = prot['reference']
-        positions = prot['positions']
-
-        #write out the reference fasta
-        refWrite.append(">"+fname)
-        refWrite.append(ref)
-
-        #go through every variable position
-        #There are no deletions here, so it's simpler than for VCF nuc sequenes!
-        for pi in positions:
-            pos = pi+1 #change numbering to match VCF not python
-            refb = ref[pi] #reference base at this position
-
-            #try/except is (much) faster than list comprehension!
-            pattern = []
-            for k,v in sequences.items():
-                try:
-                    pattern.append(sequences[k][pi])
-                except KeyError:
-                    pattern.append('.')
-            pattern = np.array(pattern)
-
-            #get the list of ALTs - minus any '.'!
-            uniques = np.unique(pattern)
-            uniques = uniques[np.where(uniques!='.')]
-
-            #Convert bases to the number that matches the ALT
-            j=1
-            for u in uniques:
-                pattern[np.where(pattern==u)[0]] = str(j)
-                j+=1
-            #Now convert these calls to #/# (VCF format)
-            calls = [ j+"/"+j if j!='.' else '.' for j in pattern ]
-            if len(uniques)==0:
-                print("UNEXPECTED ERROR WHILE CONVERTING TO VCF AT POSITION {}".format(str(pi)))
-                break
-
-            #put it all together and write it out
-            output = [fname, str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls
-
-            vcfWrite.append("\t".join(output))
-
-    #write it all out
-    with open(ref_file_name, 'w', encoding='utf-8') as the_file:
-        the_file.write("\n".join(refWrite))
-
-    with open(vcf_file_name, 'a', encoding='utf-8') as the_file:
-        the_file.write("\n".join(vcfWrite))
-
-    if vcf_file_name.lower().endswith('.gz'):
-        import os
-        #must temporarily remove .gz ending, or gzip won't zip it!
-        os.rename(vcf_file_name, vcf_file_name[:-3])
-        call = ["gzip", vcf_file_name[:-3]]
-        run_shell_command(" ".join(call), raise_errors = True)
-
-shquote = shlex.quote
-
-def run_shell_command(cmd, raise_errors=False, extra_env=None):
-    """
-    Run the given command string via Bash with error checking.
-
-    Returns True if the command exits normally.  Returns False if the command
-    exits with failure and "raise_errors" is False (the default).  When
-    "raise_errors" is True, exceptions are rethrown.
-
-    If an *extra_env* mapping is passed, the provided keys and values are
-    overlayed onto the default subprocess environment.
-    """
-    return ShellCommandRunner(cmd, raise_errors=raise_errors, extra_env=extra_env).run()
-
-
 def first_line(text):
     """
     Returns the first line of the given text, ignoring leading and trailing
diff --git a/tests/util_support/test_shell_command_runner.py b/tests/io_support/test_shell_command_runner.py
similarity index 94%
rename from tests/util_support/test_shell_command_runner.py
rename to tests/io_support/test_shell_command_runner.py
index 7b1ea5aa9..ae7c41b02 100644
--- a/tests/util_support/test_shell_command_runner.py
+++ b/tests/io_support/test_shell_command_runner.py
@@ -2,7 +2,7 @@
 import re
 import subprocess
 
-from augur.util_support.shell_command_runner import ShellCommandRunner
+from augur.io_support.shell_command_runner import ShellCommandRunner
 
 import pytest
 
@@ -69,7 +69,7 @@ def test_modified_env(self):
     )
     def test_print_error_message(self, mocker, exception, expected_message):
         mock_print_error = mocker.patch(
-            "augur.util_support.shell_command_runner.ShellCommandRunner.print_error"
+            "augur.io_support.shell_command_runner.ShellCommandRunner.print_error"
         )
 
         ShellCommandRunner("cmd").print_error_message(exception)
diff --git a/tests/test_io.py b/tests/test_io.py
index 2da0a1bff..c79fa290e 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -10,6 +10,7 @@
 import random
 import sys
 
+import augur.io as io
 from augur.io import open_file, read_sequences, write_sequences
 
 
@@ -77,6 +78,10 @@ def lzma_fasta_filename(tmpdir, sequences):
 def genbank_reference():
     return "tests/builds/zika/config/zika_outgroup.gb"
 
+@pytest.fixture
+def mock_run_shell_command(mocker):
+    mocker.patch("augur.io.run_shell_command")
+
 
 class TestReadSequences:
     def test_read_sequences_from_single_file(self, fasta_filename):
@@ -231,3 +236,71 @@ def test_open_file_read_lzma(self, tmpdir):
             f_write.write('foo\nbar\n')
         with lzma.open(path, 'rt') as f_read:
             assert f_read.read() == 'foo\nbar\n'
+
+
+class TestVCF:
+    def test_read_vcf_compressed(self):
+        seq_keep, all_seq = io.read_vcf(
+            "tests/builds/tb/data/lee_2015.vcf.gz"
+        )
+
+        assert len(seq_keep) == 150
+        assert seq_keep[149] == "G22733"
+        assert seq_keep == all_seq
+
+    def test_read_vcf_uncompressed(self):
+        seq_keep, all_seq = io.read_vcf("tests/builds/tb/data/lee_2015.vcf")
+
+        assert len(seq_keep) == 150
+        assert seq_keep[149] == "G22733"
+        assert seq_keep == all_seq
+
+    def test_write_vcf_compressed_input(self, mock_run_shell_command):
+        io.write_vcf(
+            "tests/builds/tb/data/lee_2015.vcf.gz", "output_file.vcf.gz", []
+        )
+
+        io.run_shell_command.assert_called_once_with(
+            "vcftools --gzvcf tests/builds/tb/data/lee_2015.vcf.gz --recode --stdout | gzip -c > output_file.vcf.gz",
+            raise_errors=True,
+        )
+
+    def test_write_vcf_uncompressed_input(self, mock_run_shell_command):
+        io.write_vcf(
+            "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf.gz", []
+        )
+
+        io.run_shell_command.assert_called_once_with(
+            "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout | gzip -c > output_file.vcf.gz",
+            raise_errors=True,
+        )
+
+    def test_write_vcf_compressed_output(self, mock_run_shell_command):
+        io.write_vcf(
+            "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf.gz", []
+        )
+
+        io.run_shell_command.assert_called_once_with(
+            "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout | gzip -c > output_file.vcf.gz",
+            raise_errors=True,
+        )
+
+    def test_write_vcf_uncompressed_output(self, mock_run_shell_command):
+        io.write_vcf(
+            "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf", []
+        )
+
+        io.run_shell_command.assert_called_once_with(
+            "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout  > output_file.vcf",
+            raise_errors=True,
+        )
+
+    def test_write_vcf_dropped_samples(self, mock_run_shell_command):
+        io.write_vcf(
+            "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf", ["x", "y", "z"]
+        )
+
+        io.run_shell_command.assert_called_once_with(
+            "vcftools --remove-indv x --remove-indv y --remove-indv z --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout  > output_file.vcf",
+            raise_errors=True,
+        )
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 90946df78..05ecd07fb 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,11 +7,6 @@
 from test_filter import write_metadata
 
 
-@pytest.fixture
-def mock_run_shell_command(mocker):
-    mocker.patch("augur.utils.run_shell_command")
-
-
 class TestUtils:
     @pytest.mark.parametrize("extension", ["bed","BED"])
     @patch('augur.utils.read_bed_file')
@@ -111,69 +106,3 @@ def test_read_metadata(self, tmpdir):
         with pytest.raises(ValueError) as e_info:
             utils.read_metadata(meta_fn, as_data_frame=True)
         assert str(e_info.value) == "Duplicated strain in metadata: SEQ_1"
-
-    def test_read_vcf_compressed(self):
-        seq_keep, all_seq = utils.read_vcf(
-            "tests/builds/tb/data/lee_2015.vcf.gz"
-        )
-
-        assert len(seq_keep) == 150
-        assert seq_keep[149] == "G22733"
-        assert seq_keep == all_seq
-
-    def test_read_vcf_uncompressed(self):
-        seq_keep, all_seq = utils.read_vcf("tests/builds/tb/data/lee_2015.vcf")
-
-        assert len(seq_keep) == 150
-        assert seq_keep[149] == "G22733"
-        assert seq_keep == all_seq
-
-    def test_write_vcf_compressed_input(self, mock_run_shell_command):
-        utils.write_vcf(
-            "tests/builds/tb/data/lee_2015.vcf.gz", "output_file.vcf.gz", []
-        )
-
-        utils.run_shell_command.assert_called_once_with(
-            "vcftools --gzvcf tests/builds/tb/data/lee_2015.vcf.gz --recode --stdout | gzip -c > output_file.vcf.gz",
-            raise_errors=True,
-        )
-
-    def test_write_vcf_uncompressed_input(self, mock_run_shell_command):
-        utils.write_vcf(
-            "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf.gz", []
-        )
-
-        utils.run_shell_command.assert_called_once_with(
-            "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout | gzip -c > output_file.vcf.gz",
-            raise_errors=True,
-        )
-
-    def test_write_vcf_compressed_output(self, mock_run_shell_command):
-        utils.write_vcf(
-            "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf.gz", []
-        )
-
-        utils.run_shell_command.assert_called_once_with(
-            "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout | gzip -c > output_file.vcf.gz",
-            raise_errors=True,
-        )
-
-    def test_write_vcf_uncompressed_output(self, mock_run_shell_command):
-        utils.write_vcf(
-            "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf", []
-        )
-
-        utils.run_shell_command.assert_called_once_with(
-            "vcftools --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout  > output_file.vcf",
-            raise_errors=True,
-        )
-
-    def test_write_vcf_dropped_samples(self, mock_run_shell_command):
-        utils.write_vcf(
-            "tests/builds/tb/data/lee_2015.vcf", "output_file.vcf", ["x", "y", "z"]
-        )
-
-        utils.run_shell_command.assert_called_once_with(
-            "vcftools --remove-indv x --remove-indv y --remove-indv z --vcf tests/builds/tb/data/lee_2015.vcf --recode --stdout  > output_file.vcf",
-            raise_errors=True,
-        )