Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Augur Mask: Add additional options from NCOV mask-alignment.py script #512

Merged
merged 17 commits into from
Apr 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 47 additions & 16 deletions augur/mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import MutableSeq

from .utils import run_shell_command, shquote, open_file, is_vcf

Expand All @@ -32,7 +33,7 @@ def read_bed_file(mask_file):
bed = pd.read_csv(mask_file, sep='\t', header=None, usecols=[1,2])
for idx, row in bed.iterrows():
try:
sites_to_mask.extend(list(range(int(row[1]), int(row[2])+1)))
sites_to_mask.extend(range(int(row[1]), int(row[2])))
except ValueError as err:
# Skip unparseable lines, including header lines.
print("Could not read line %d of BED file %s: %s. Continuing." % (idx, mask_file, err))
Expand Down Expand Up @@ -66,7 +67,8 @@ def mask_vcf(mask_sites, in_file, out_file, cleanup=True):
"Please check the file is valid VCF format.")
sys.exit(1)

exclude = [chrom_name + "\t" + str(pos) for pos in mask_sites]
# mask_sites is zero-indexed, VCFTools expects 1-indexed.
exclude = [chrom_name + "\t" + str(pos + 1) for pos in mask_sites]
temp_mask_file = in_file + "_maskTemp"
with open_file(temp_mask_file, 'w') as fh:
fh.write("\n".join(exclude))
Expand All @@ -88,7 +90,7 @@ def mask_vcf(mask_sites, in_file, out_file, cleanup=True):
except OSError:
pass

def mask_fasta(mask_sites, in_file, out_file):
def mask_fasta(mask_sites, in_file, out_file, mask_from_beginning=0, mask_from_end=0):
"""Mask the provided site list from a FASTA file and write to a new file.

Masked sites are overwritten as "N"s.
Expand All @@ -101,6 +103,10 @@ def mask_fasta(mask_sites, in_file, out_file):
The path to the FASTA file you wish to mask.
out_file: str
The path to write the resulting FASTA to
mask_from_beginning: int
Number of sites to mask from the beginning of each sequence (default 0)
mask_from_end: int
Number of sites to mask from the end of each sequence (default 0)
"""
# Load alignment as FASTA generator to prevent loading the whole alignment
# into memory.
Expand All @@ -111,8 +117,15 @@ def mask_fasta(mask_sites, in_file, out_file):
with open_file(out_file, "w") as oh:
for record in alignment:
# Convert to a mutable sequence to enable masking with Ns.
sequence = record.seq.tomutable()
sequence_length = len(sequence)
sequence_length = len(record.seq)
beginning, end = mask_from_beginning, mask_from_end
if beginning + end > sequence_length:
beginning, end = sequence_length, 0
sequence = MutableSeq(
"N" * beginning +
str(record.seq)[beginning:-end or None] +
"N" * end
)
# Replace all excluded sites with Ns.
for site in mask_sites:
if site < sequence_length:
Expand All @@ -122,7 +135,10 @@ def mask_fasta(mask_sites, in_file, out_file):

def register_arguments(parser):
parser.add_argument('--sequences', '-s', required=True, help="sequences in VCF or FASTA format")
parser.add_argument('--mask', required=True, help="locations to be masked in BED file format")
parser.add_argument('--mask', dest="mask_file", required=False, help="locations to be masked in BED file format")
parser.add_argument('--mask-from-beginning', type=int, default=0, help="FASTA Only: Number of sites to mask from beginning")
parser.add_argument('--mask-from-end', type=int, default=0, help="FASTA Only: Number of sites to mask from end")
parser.add_argument("--mask-sites", nargs='+', type = int, help="1-indexed list of sites to mask")
parser.add_argument('--output', '-o', help="output file")
parser.add_argument('--no-cleanup', dest="cleanup", action="store_false",
help="Leave intermediate files around. May be useful for debugging")
Expand All @@ -141,19 +157,29 @@ def run(args):
# Check files exist and are not empty
if not os.path.isfile(args.sequences):
print("ERROR: File {} does not exist!".format(args.sequences))
return 1
if not os.path.isfile(args.mask):
print("ERROR: File {} does not exist!".format(args.mask))
return 1
sys.exit(1)
if os.path.getsize(args.sequences) == 0:
print("ERROR: {} is empty. Please check how this file was produced. "
"Did an error occur in an earlier step?".format(args.sequences))
return 1
if os.path.getsize(args.mask) == 0:
print("ERROR: {} is an empty file.".format(args.mask))
return 1
sys.exit(1)
if args.mask_file:
if not os.path.isfile(args.mask_file):
print("ERROR: File {} does not exist!".format(args.mask_file))
sys.exit(1)
if os.path.getsize(args.mask_file) == 0:
print("ERROR: {} is an empty file.".format(args.mask_file))
sys.exit(1)
if not any((args.mask_file, args.mask_from_beginning, args.mask_from_end, args.mask_sites)):
print("No masking sites provided. Must include one of --mask, --mask-from-beginning, --mask-from-end, or --mask-sites")
sys.exit(1)

mask_sites = read_bed_file(args.mask)
mask_sites = set()
if args.mask_sites:
# Mask sites passed in as 1-indexed
mask_sites.update(site - 1 for site in args.mask_sites)
if args.mask_file:
mask_sites.update(read_bed_file(args.mask_file))
mask_sites = sorted(mask_sites)

# For both FASTA and VCF masking, we need a proper separate output file
if args.output is not None:
Expand All @@ -163,9 +189,14 @@ def run(args):
"masked_" + os.path.basename(args.sequences))

if is_vcf(args.sequences):
if args.mask_from_beginning or args.mask_from_end:
print("Cannot use --mask-from-beginning or --mask-from-end with VCF files!")
sys.exit(1)
mask_vcf(mask_sites, args.sequences, out_file, args.cleanup)
else:
mask_fasta(mask_sites, args.sequences, out_file)
mask_fasta(mask_sites, args.sequences, out_file,
mask_from_beginning=args.mask_from_beginning,
mask_from_end=args.mask_from_end)

if args.output is None:
copyfile(out_file, args.sequences)
Expand Down
Loading