-
Notifications
You must be signed in to change notification settings - Fork 403
/
mask-alignment.py
54 lines (48 loc) · 2.11 KB
/
mask-alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""
Mask initial bases from alignment FASTA
"""
import argparse
from augur.io import open_file, read_sequences, write_sequences
import Bio
import Bio.SeqIO
from Bio.Seq import Seq
def mask_terminal_gaps(seq):
L = len(seq)
seq_trimmed = seq.lstrip('-')
left_gaps = L - len(seq_trimmed)
seq_trimmed = seq_trimmed.rstrip('-')
right_gaps = L - len(seq_trimmed) - left_gaps
return "N"*left_gaps + seq_trimmed + "N"*right_gaps
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Mask initial bases from alignment FASTA",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("--alignment", required=True, help="FASTA file of alignment")
parser.add_argument("--mask-terminal-gaps", action='store_true', help="fill all terminal gaps with N as they likely represent missing data")
parser.add_argument("--mask-from-beginning", type = int, required=True, help="number of bases to mask from start")
parser.add_argument("--mask-from-end", type = int, help="number of bases to mask from end")
parser.add_argument("--mask-sites", nargs='+', type = int, help="list of sites to mask")
parser.add_argument("--output", required=True, help="FASTA file of output alignment")
args = parser.parse_args()
begin_length = 0
if args.mask_from_beginning:
begin_length = args.mask_from_beginning
end_length = 0
if args.mask_from_end:
end_length = args.mask_from_end
with open_file(args.output, 'w') as outfile:
for record in read_sequences(args.alignment):
seq = str(record.seq)
if args.mask_terminal_gaps:
seq = mask_terminal_gaps(seq)
start = "N" * begin_length
middle = seq[begin_length:-end_length]
end = "N" * end_length
seq_list = list(start + middle + end)
if args.mask_sites:
for site in args.mask_sites:
if seq_list[site-1]!='-':
seq_list[site-1] = "N"
record.seq = Seq("".join(seq_list))
write_sequences(record, outfile)