nextstrain · huddlej · Mar 18, 2021 · Dec 24, 2020 · Mar 10, 2021 · Dec 30, 2020
diff --git a/augur/filter.py b/augur/filter.py
@@ -14,6 +14,7 @@
 import treetime.utils
 
 from .index import index_sequences
+from .io import open_file, read_sequences, write_sequences
 from .utils import read_metadata, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous
 
 comment_char = '#'
@@ -545,19 +546,19 @@ def run(args):
         dropped_samps = list(available_strains - seq_keep)
         write_vcf(args.sequences, args.output, dropped_samps)
     elif args.sequences and args.output:
-        sequences = SeqIO.parse(args.sequences, "fasta")
+        sequences = read_sequences(args.sequences)
 
         # Stream to disk all sequences that passed all filters to avoid reading
         # sequences into memory first. Track the observed strain names in the
         # sequence file as part of the single pass to allow comparison with the
         # provided sequence index.
         observed_sequence_strains = set()
-        with open(args.output, "w") as output_handle:
+        with open_file(args.output, "wt") as output_handle:
             for sequence in sequences:
                 observed_sequence_strains.add(sequence.id)
 
                 if sequence.id in seq_keep:
-                    SeqIO.write(sequence, output_handle, 'fasta')
+                    write_sequences(sequence, output_handle, 'fasta')
 
         if sequence_strains != observed_sequence_strains:
             # Warn the user if the expected strains from the sequence index are

diff --git a/augur/index.py b/augur/index.py
@@ -8,11 +8,15 @@
 import sys
 import csv
 
+from .io import open_file, read_sequences
+
+
 def register_arguments(parser):
     parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta format")
     parser.add_argument('--output', '-o', help="tab-delimited file containing the number of bases per sequence in the given file. Output columns include strain, length, and counts for A, C, G, T, N, other valid IUPAC characters, ambiguous characters ('?' and '-'), and other invalid characters.", required=True)
     parser.add_argument('--verbose', '-v', action="store_true", help="print index statistics to stdout")
 
+
 def index_sequence(sequence, values):
     """Count the number of nucleotides for a given sequence record.
 
@@ -127,13 +131,7 @@ def index_sequences(sequences_path, sequence_index_path):
         total length of sequences indexed
 
     """
-    #read in files
-    try:
-        seqs = SeqIO.parse(sequences_path, 'fasta')
-    except ValueError as error:
-        print("ERROR: Problem reading in {}:".format(sequences_path), file=sys.stderr)
-        print(error, file=sys.stderr)
-        return 1
+    seqs = read_sequences(sequences_path)
 
     other_IUPAC = {'r', 'y', 's', 'w', 'k', 'm', 'd', 'h', 'b', 'v'}
     values = [{'a'},{'c'},{'g'},{'t'},{'n'},other_IUPAC,{'-'},{'?'}]
@@ -142,7 +140,7 @@ def index_sequences(sequences_path, sequence_index_path):
     tot_length = 0
     num_of_seqs = 0
 
-    with open(sequence_index_path, 'wt') as out_file:
+    with open_file(sequence_index_path, 'wt') as out_file:
         tsv_writer = csv.writer(out_file, delimiter = '\t')
 
         #write header i output file
@@ -166,7 +164,12 @@ def run(args):
     ("?" and "-"), and other invalid characters in a set of sequences and write
     the composition as a data frame to the given sequence index path.
     '''
-    num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
+    try:
+        num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
+    except ValueError as error:
+        print("ERROR: Problem reading in {}:".format(sequences_path), file=sys.stderr)
+        print(error, file=sys.stderr)
+        return 1
 
     if args.verbose:
         print("Analysed %i sequences with an average length of %i nucleotides." % (num_of_seqs, int(tot_length / num_of_seqs)))
diff --git a/augur/io.py b/augur/io.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""Interfaces for reading and writing data also known as input/output (I/O)
+"""
+import Bio.SeqIO
+import Bio.SeqRecord
+from contextlib import contextmanager
+from pathlib import Path
+from xopen import xopen
+
+
+@contextmanager
+def open_file(path_or_buffer, mode="r", **kwargs):
+    """Opens a given file path and returns the handle.
+
+    Transparently handles compressed inputs and outputs.
+
+    Parameters
+    ----------
+    path_or_buffer : str or Path-like or IO buffer
+        Name of the file to open or an existing IO buffer
+
+    mode : str
+        Mode to open file (read or write)
+
+    Returns
+    -------
+    IO
+        File handle object
+
+    """
+    try:
+        with xopen(path_or_buffer, mode, **kwargs) as handle:
+            yield handle
+    except TypeError:
+        yield path_or_buffer
+
+
+def read_sequences(*paths, format="fasta"):
+    """Read sequences from one or more paths.
+
+    Automatically infer compression mode (e.g., gzip, etc.) and return a stream
+    of sequence records in the requested format (e.g., "fasta", "genbank", etc.).
+
+    Parameters
+    ----------
+    paths : list of str or Path-like objects
+        One or more paths to sequence files of any type supported by BioPython.
+
+    format : str
+        Format of input sequences matching any of those supported by BioPython
+        (e.g., "fasta", "genbank", etc.).
+
+    Yields
+    ------
+    Bio.SeqRecord.SeqRecord
+        Sequence record from the given path(s).
+
+    """
+    for path in paths:
+        # Open the given path as a handle, inferring the file's compression.
+        # This way we can pass a handle to BioPython's SeqIO interface
+        # regardless of the compression mode.
+        with open_file(path) as handle:
+            sequences = Bio.SeqIO.parse(handle, format)
+
+            for sequence in sequences:
+                yield sequence
+
+
+def write_sequences(sequences, path_or_buffer, format="fasta"):
+    """Write sequences to a given path in the given format.
+
+    Automatically infer compression mode (e.g., gzip, etc.) based on the path's
+    filename extension.
+
+    Parameters
+    ----------
+    sequences : iterable of Bio.SeqRecord.SeqRecord objects
+        A list-like collection of sequences to write
+
+    path_or_buffer : str or Path-like object or IO buffer
+        A path to a file to write the given sequences in the given format.
+
+    format : str
+        Format of input sequences matching any of those supported by BioPython
+        (e.g., "fasta", "genbank", etc.)
+
+    Returns
+    -------
+    int :
+        Number of sequences written out to the given path.
+
+    """
+    with open_file(path_or_buffer, "wt") as handle:
+        # Bio.SeqIO supports writing to the same handle multiple times for specific
+        # file formats. For the formats we use, this function call should work for
+        # both a newly opened file handle or one that is provided by the caller.
+        # For more details see:
+        # https://github.com/biopython/biopython/blob/25f5152f4aeefe184a323db25694fbfe0593f0e2/Bio/SeqIO/__init__.py#L233-L251
+        sequences_written = Bio.SeqIO.write(
+            sequences,
+            handle,
+            format
+        )
+
+    return sequences_written
diff --git a/augur/mask.py b/augur/mask.py
@@ -10,13 +10,14 @@
 from Bio import SeqIO
 from Bio.Seq import MutableSeq
 
-from .utils import run_shell_command, shquote, open_file, is_vcf, load_mask_sites, VALID_NUCLEOTIDES
+from .io import open_file, read_sequences, write_sequences
+from .utils import run_shell_command, shquote, is_vcf, load_mask_sites, VALID_NUCLEOTIDES
 
 def get_chrom_name(vcf_file):
     """Read the CHROM field from the first non-header line of a vcf file.
-    
+
     Returns:
-    str or None: Either the CHROM field or None if no non-comment line could be found. 
+    str or None: Either the CHROM field or None if no non-comment line could be found.
     """
     with open_file(vcf_file, mode='r') as f:
         for line in f:
@@ -29,8 +30,8 @@ def mask_vcf(mask_sites, in_file, out_file, cleanup=True):
 
     This function relies on 'vcftools --exclude-positions' to mask the requested sites.
 
-    Parameters:
-    -----------
+    Parameters
+    ----------
     mask_sites: list[int]
         A list of site indexes to exclude from the vcf.
     in_file: str
@@ -73,13 +74,61 @@ def mask_vcf(mask_sites, in_file, out_file, cleanup=True):
             except OSError:
                 pass
 
+
+def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask_invalid):
+    """Mask characters at the given sites in a single sequence record, modifying the
+    record in place.
+
+    Parameters
+    ----------
+    sequence : Bio.SeqIO.SeqRecord
+        A sequence to be masked
+    mask_sites: list[int]
+        A list of site indexes to exclude from the FASTA.
+    mask_from_beginning: int
+        Number of sites to mask from the beginning of each sequence (default 0)
+    mask_from_end: int
+        Number of sites to mask from the end of each sequence (default 0)
+    mask_invalid: bool
+        Mask invalid nucleotides (default False)
+
+    Returns
+    -------
+    Bio.SeqIO.SeqRecord
+        Masked sequence in its original record object
+
+    """
+    # Convert to a mutable sequence to enable masking with Ns.
+    sequence_length = len(sequence.seq)
+    beginning, end = mask_from_beginning, mask_from_end
+
+    if beginning + end > sequence_length:
+        beginning, end = sequence_length, 0
+
+    seq = str(sequence.seq)[beginning:-end or None]
+
+    if mask_invalid:
+        seq = "".join(nuc if nuc in VALID_NUCLEOTIDES else "N" for nuc in seq)
+
+    masked_sequence = MutableSeq("N" * beginning + seq + "N" * end)
+
+    # Replace all excluded sites with Ns.
+    for site in mask_sites:
+        if site < sequence_length:
+            masked_sequence[site] = "N"
+
+    sequence.seq = masked_sequence
+
+    return sequence
+
+
 def mask_fasta(mask_sites, in_file, out_file, mask_from_beginning=0, mask_from_end=0, mask_invalid=False):
     """Mask the provided site list from a FASTA file and write to a new file.
 
     Masked sites are overwritten as "N"s.
 
-    Parameters:
-    -----------
+    Parameters
+    ----------
     mask_sites: list[int]
         A list of site indexes to exclude from the FASTA.
     in_file: str
@@ -95,27 +144,27 @@ def mask_fasta(mask_sites, in_file, out_file, mask_from_beginning=0, mask_from_e
     """
     # Load alignment as FASTA generator to prevent loading the whole alignment
     # into memory.
-    alignment = SeqIO.parse(in_file, "fasta")
+    alignment = read_sequences(in_file)
 
     # Write the masked alignment to disk one record at a time.
     print("Removing masked sites from FASTA file.")
-    with open_file(out_file, "w") as oh:
-        for record in alignment:
-            # Convert to a mutable sequence to enable masking with Ns.
-            sequence_length = len(record.seq)
-            beginning, end = mask_from_beginning, mask_from_end
-            if beginning + end > sequence_length:
-                beginning, end = sequence_length, 0
-            seq = str(record.seq)[beginning:-end or None]
-            if mask_invalid:
-                seq = "".join(nuc if nuc in VALID_NUCLEOTIDES else "N" for nuc in seq)
-            sequence = MutableSeq("N" * beginning + seq + "N" * end)
-            # Replace all excluded sites with Ns.
-            for site in mask_sites:
-                if site < sequence_length:
-                    sequence[site] = "N"
-            record.seq = sequence
-            SeqIO.write(record, oh, "fasta")
+
+    masked_sequences = (
+        mask_sequence(
+            sequence,
+            mask_sites,
+            mask_from_beginning,
+            mask_from_end,
+            mask_invalid,
+        )
+        for sequence in alignment
+    )
+    sequences_written = write_sequences(
+        masked_sequences,
+        out_file,
+        "fasta"
+    )
+
 
 def register_arguments(parser):
     parser.add_argument('--sequences', '-s', required=True, help="sequences in VCF or FASTA format")
@@ -179,7 +228,7 @@ def run(args):
             sys.exit(1)
         mask_vcf(mask_sites, args.sequences, out_file, args.cleanup)
     else:
-        mask_fasta(mask_sites, args.sequences, out_file, 
+        mask_fasta(mask_sites, args.sequences, out_file,
                    mask_from_beginning=args.mask_from_beginning,
                    mask_from_end=args.mask_from_end,
                    mask_invalid=args.mask_invalid)