Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add read/write sequence interface with support for compressed sequences #652

Merged
merged 7 commits into from
Mar 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import treetime.utils

from .index import index_sequences
from .io import open_file, read_sequences, write_sequences
from .utils import read_metadata, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous

comment_char = '#'
Expand Down Expand Up @@ -545,19 +546,19 @@ def run(args):
dropped_samps = list(available_strains - seq_keep)
write_vcf(args.sequences, args.output, dropped_samps)
elif args.sequences and args.output:
sequences = SeqIO.parse(args.sequences, "fasta")
sequences = read_sequences(args.sequences)

# Stream to disk all sequences that passed all filters to avoid reading
# sequences into memory first. Track the observed strain names in the
# sequence file as part of the single pass to allow comparison with the
# provided sequence index.
observed_sequence_strains = set()
with open(args.output, "w") as output_handle:
with open_file(args.output, "wt") as output_handle:
for sequence in sequences:
observed_sequence_strains.add(sequence.id)

if sequence.id in seq_keep:
SeqIO.write(sequence, output_handle, 'fasta')
write_sequences(sequence, output_handle, 'fasta')

if sequence_strains != observed_sequence_strains:
# Warn the user if the expected strains from the sequence index are
Expand Down
21 changes: 12 additions & 9 deletions augur/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@
import sys
import csv

from .io import open_file, read_sequences


def register_arguments(parser):
parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta format")
parser.add_argument('--output', '-o', help="tab-delimited file containing the number of bases per sequence in the given file. Output columns include strain, length, and counts for A, C, G, T, N, other valid IUPAC characters, ambiguous characters ('?' and '-'), and other invalid characters.", required=True)
parser.add_argument('--verbose', '-v', action="store_true", help="print index statistics to stdout")


def index_sequence(sequence, values):
"""Count the number of nucleotides for a given sequence record.

Expand Down Expand Up @@ -127,13 +131,7 @@ def index_sequences(sequences_path, sequence_index_path):
total length of sequences indexed

"""
#read in files
try:
seqs = SeqIO.parse(sequences_path, 'fasta')
except ValueError as error:
print("ERROR: Problem reading in {}:".format(sequences_path), file=sys.stderr)
print(error, file=sys.stderr)
return 1
seqs = read_sequences(sequences_path)

other_IUPAC = {'r', 'y', 's', 'w', 'k', 'm', 'd', 'h', 'b', 'v'}
values = [{'a'},{'c'},{'g'},{'t'},{'n'},other_IUPAC,{'-'},{'?'}]
Expand All @@ -142,7 +140,7 @@ def index_sequences(sequences_path, sequence_index_path):
tot_length = 0
num_of_seqs = 0

with open(sequence_index_path, 'wt') as out_file:
with open_file(sequence_index_path, 'wt') as out_file:
tsv_writer = csv.writer(out_file, delimiter = '\t')

#write header i output file
Expand All @@ -166,7 +164,12 @@ def run(args):
("?" and "-"), and other invalid characters in a set of sequences and write
the composition as a data frame to the given sequence index path.
'''
num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
try:
num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
except ValueError as error:
print("ERROR: Problem reading in {}:".format(sequences_path), file=sys.stderr)
print(error, file=sys.stderr)
return 1

if args.verbose:
print("Analysed %i sequences with an average length of %i nucleotides." % (num_of_seqs, int(tot_length / num_of_seqs)))
106 changes: 106 additions & 0 deletions augur/io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""Interfaces for reading and writing data also known as input/output (I/O)
"""
import Bio.SeqIO
import Bio.SeqRecord
from contextlib import contextmanager
from pathlib import Path
from xopen import xopen


@contextmanager
def open_file(path_or_buffer, mode="r", **kwargs):
"""Opens a given file path and returns the handle.

Transparently handles compressed inputs and outputs.

Parameters
----------
path_or_buffer : str or Path-like or IO buffer
Name of the file to open or an existing IO buffer

mode : str
Mode to open file (read or write)

Returns
-------
IO
File handle object

"""
try:
with xopen(path_or_buffer, mode, **kwargs) as handle:
yield handle
except TypeError:
yield path_or_buffer


def read_sequences(*paths, format="fasta"):
"""Read sequences from one or more paths.

Automatically infer compression mode (e.g., gzip, etc.) and return a stream
of sequence records in the requested format (e.g., "fasta", "genbank", etc.).

Parameters
----------
paths : list of str or Path-like objects
One or more paths to sequence files of any type supported by BioPython.

format : str
Format of input sequences matching any of those supported by BioPython
(e.g., "fasta", "genbank", etc.).

Yields
------
Bio.SeqRecord.SeqRecord
Sequence record from the given path(s).

"""
for path in paths:
# Open the given path as a handle, inferring the file's compression.
# This way we can pass a handle to BioPython's SeqIO interface
# regardless of the compression mode.
with open_file(path) as handle:
sequences = Bio.SeqIO.parse(handle, format)

for sequence in sequences:
yield sequence


def write_sequences(sequences, path_or_buffer, format="fasta"):
"""Write sequences to a given path in the given format.

Automatically infer compression mode (e.g., gzip, etc.) based on the path's
filename extension.

Parameters
----------
sequences : iterable of Bio.SeqRecord.SeqRecord objects
A list-like collection of sequences to write

path_or_buffer : str or Path-like object or IO buffer
A path to a file to write the given sequences in the given format.

format : str
Format of input sequences matching any of those supported by BioPython
(e.g., "fasta", "genbank", etc.)

Returns
-------
int :
Number of sequences written out to the given path.

"""
with open_file(path_or_buffer, "wt") as handle:
# Bio.SeqIO supports writing to the same handle multiple times for specific
# file formats. For the formats we use, this function call should work for
# both a newly opened file handle or one that is provided by the caller.
# For more details see:
# https://github.com/biopython/biopython/blob/25f5152f4aeefe184a323db25694fbfe0593f0e2/Bio/SeqIO/__init__.py#L233-L251
sequences_written = Bio.SeqIO.write(
sequences,
handle,
format
)

return sequences_written
101 changes: 75 additions & 26 deletions augur/mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
from Bio import SeqIO
from Bio.Seq import MutableSeq

from .utils import run_shell_command, shquote, open_file, is_vcf, load_mask_sites, VALID_NUCLEOTIDES
from .io import open_file, read_sequences, write_sequences
from .utils import run_shell_command, shquote, is_vcf, load_mask_sites, VALID_NUCLEOTIDES

def get_chrom_name(vcf_file):
"""Read the CHROM field from the first non-header line of a vcf file.

Returns:
str or None: Either the CHROM field or None if no non-comment line could be found.
str or None: Either the CHROM field or None if no non-comment line could be found.
"""
with open_file(vcf_file, mode='r') as f:
for line in f:
Expand All @@ -29,8 +30,8 @@ def mask_vcf(mask_sites, in_file, out_file, cleanup=True):

This function relies on 'vcftools --exclude-positions' to mask the requested sites.

Parameters:
-----------
Parameters
----------
mask_sites: list[int]
A list of site indexes to exclude from the vcf.
in_file: str
Expand Down Expand Up @@ -73,13 +74,61 @@ def mask_vcf(mask_sites, in_file, out_file, cleanup=True):
except OSError:
pass


def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask_invalid):
huddlej marked this conversation as resolved.
Show resolved Hide resolved
"""Mask characters at the given sites in a single sequence record, modifying the
record in place.

Parameters
----------
sequence : Bio.SeqIO.SeqRecord
A sequence to be masked
mask_sites: list[int]
A list of site indexes to exclude from the FASTA.
mask_from_beginning: int
Number of sites to mask from the beginning of each sequence (default 0)
mask_from_end: int
Number of sites to mask from the end of each sequence (default 0)
mask_invalid: bool
Mask invalid nucleotides (default False)

Returns
-------
Bio.SeqIO.SeqRecord
Masked sequence in its original record object

"""
# Convert to a mutable sequence to enable masking with Ns.
sequence_length = len(sequence.seq)
beginning, end = mask_from_beginning, mask_from_end
huddlej marked this conversation as resolved.
Show resolved Hide resolved

if beginning + end > sequence_length:
beginning, end = sequence_length, 0

seq = str(sequence.seq)[beginning:-end or None]

if mask_invalid:
seq = "".join(nuc if nuc in VALID_NUCLEOTIDES else "N" for nuc in seq)

masked_sequence = MutableSeq("N" * beginning + seq + "N" * end)

# Replace all excluded sites with Ns.
for site in mask_sites:
if site < sequence_length:
masked_sequence[site] = "N"

sequence.seq = masked_sequence

return sequence


def mask_fasta(mask_sites, in_file, out_file, mask_from_beginning=0, mask_from_end=0, mask_invalid=False):
"""Mask the provided site list from a FASTA file and write to a new file.

Masked sites are overwritten as "N"s.

Parameters:
-----------
Parameters
----------
mask_sites: list[int]
A list of site indexes to exclude from the FASTA.
in_file: str
Expand All @@ -95,27 +144,27 @@ def mask_fasta(mask_sites, in_file, out_file, mask_from_beginning=0, mask_from_e
"""
# Load alignment as FASTA generator to prevent loading the whole alignment
# into memory.
alignment = SeqIO.parse(in_file, "fasta")
alignment = read_sequences(in_file)

# Write the masked alignment to disk one record at a time.
print("Removing masked sites from FASTA file.")
with open_file(out_file, "w") as oh:
for record in alignment:
# Convert to a mutable sequence to enable masking with Ns.
sequence_length = len(record.seq)
beginning, end = mask_from_beginning, mask_from_end
if beginning + end > sequence_length:
beginning, end = sequence_length, 0
seq = str(record.seq)[beginning:-end or None]
if mask_invalid:
seq = "".join(nuc if nuc in VALID_NUCLEOTIDES else "N" for nuc in seq)
sequence = MutableSeq("N" * beginning + seq + "N" * end)
# Replace all excluded sites with Ns.
for site in mask_sites:
if site < sequence_length:
sequence[site] = "N"
record.seq = sequence
SeqIO.write(record, oh, "fasta")

masked_sequences = (
mask_sequence(
sequence,
mask_sites,
mask_from_beginning,
mask_from_end,
mask_invalid,
)
for sequence in alignment
)
sequences_written = write_sequences(
masked_sequences,
out_file,
"fasta"
)


def register_arguments(parser):
parser.add_argument('--sequences', '-s', required=True, help="sequences in VCF or FASTA format")
Expand Down Expand Up @@ -179,7 +228,7 @@ def run(args):
sys.exit(1)
mask_vcf(mask_sites, args.sequences, out_file, args.cleanup)
else:
mask_fasta(mask_sites, args.sequences, out_file,
mask_fasta(mask_sites, args.sequences, out_file,
mask_from_beginning=args.mask_from_beginning,
mask_from_end=args.mask_from_end,
mask_invalid=args.mask_invalid)
Expand Down
Loading