Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multiple improvements to augur clades #1199

Merged
merged 8 commits into from
May 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 98 additions & 31 deletions augur/clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@
from collections import defaultdict
import networkx as nx
from itertools import islice
from .errors import AugurError
from argparse import SUPPRESS
from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name

UNASSIGNED = 'unassigned'

def read_in_clade_definitions(clade_file):
'''
Reads in tab-seperated file that defines clades by amino acid or nucleotide mutations
Expand Down Expand Up @@ -120,10 +124,13 @@ def read_in_clade_definitions(clade_file):
if clade != root
}

if not len(clades.keys()):
raise AugurError(f"No clades were defined in {clade_file}")

return clades


def is_node_in_clade(clade_alleles, node, ref):
def is_node_in_clade(clade_alleles, node, root_sequence):
'''
Determines whether a node matches the clade definition based on sequence
For any condition, will first look in mutations stored in node.sequences,
Expand All @@ -132,11 +139,13 @@ def is_node_in_clade(clade_alleles, node, ref):
Parameters
----------
clade_alleles : list
list of clade defining alleles
list of clade defining alleles (typically supplied from the input TSV)
node : Bio.Phylo.BaseTree.Clade
node to check, assuming sequences (as mutations) are attached to node
ref : str or list
positions
node.sequences specifies nucleotides/codons which are newly observed on this node
i.e. they are the result of a mutation observed on the branch leading to this node
root_sequence : dict
{geneName: observed root sequence (list)}

Returns
-------
Expand All @@ -148,15 +157,33 @@ def is_node_in_clade(clade_alleles, node, ref):
for gene, pos, clade_state in clade_alleles:
if gene in node.sequences and pos in node.sequences[gene]:
state = node.sequences[gene][pos]
elif ref and gene in ref:
state = ref[gene][pos]
elif root_sequence and gene in root_sequence:
try:
state = root_sequence[gene][pos]
except IndexError:
raise AugurError(f"A clade definition specifies {{{gene},{pos+1},{clade_state}}} which \
is beyond the bounds of the supplied root sequence for {gene} (length {len(root_sequence[gene])})")
else:
state = ''

conditions.append(state==clade_state)

return all(conditions)

def ensure_no_multiple_mutations(all_muts):
multiples = []

for name,node in all_muts.items():
nt_positions = [int(mut[1:-1])-1 for mut in node.get('muts', [])]
if len(set(nt_positions))!=len(nt_positions):
multiples.append(f"Node {name} (nuc)")
for gene in node.get('aa_muts', {}):
aa_positions = [int(mut[1:-1])-1 for mut in node['aa_muts'][gene]]
if len(set(aa_positions))!=len(aa_positions):
multiples.append(f"Node {name} ({gene})")

if multiples:
raise AugurError(f"Multiple mutations at the same position on a single branch were found: {', '.join(multiples)}")

def assign_clades(clade_designations, all_muts, tree, ref=None):
'''
Expand Down Expand Up @@ -189,7 +216,7 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):

# first pass to set all nodes to unassigned as precaution to ensure attribute is set
for node in tree.find_clades(order = 'preorder'):
clade_membership[node.name] = 'unassigned'
clade_membership[node.name] = UNASSIGNED

# count leaves
for node in tree.find_clades(order = 'postorder'):
Expand All @@ -200,16 +227,16 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
c.up=node
tree.root.up = None
tree.root.sequences = {'nuc':{}}
tree.root.sequences.update({gene:{} for gene in all_muts[tree.root.name]['aa_muts']})
tree.root.sequences.update({gene:{} for gene in all_muts.get(tree.root.name, {}).get('aa_muts', {})})

# attach sequences to all nodes
for node in tree.find_clades(order='preorder'):
if node.up:
node.sequences = {gene:muts.copy() for gene, muts in node.up.sequences.items()}
for mut in all_muts[node.name]['muts']:
for mut in all_muts.get(node.name, {}).get('muts', []):
a, pos, d = mut[0], int(mut[1:-1])-1, mut[-1]
node.sequences['nuc'][pos] = d
if 'aa_muts' in all_muts[node.name]:
if 'aa_muts' in all_muts.get(node.name, {}):
for gene in all_muts[node.name]['aa_muts']:
for mut in all_muts[node.name]['aa_muts'][gene]:
a, pos, d = mut[0], int(mut[1:-1])-1, mut[-1]
Expand Down Expand Up @@ -242,49 +269,88 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):

return (clade_membership, clade_labels)

def warn_if_clades_not_found(membership, clade_designations):
clades = set(clade_designations.keys())
found = set([clade for clade in membership.values() if clade!=UNASSIGNED])
if not(len(found)):
print(f"WARNING in augur.clades: no clades found in tree!")
return
for clade in clades-found:
# warn loudly - one line per unfound clade
print(f"WARNING in augur.clades: clade '{clade}' not found in tree!")


def get_reference_sequence_from_root_node(all_muts, root_name):
# attach sequences to root
"""
Extracts the (nuc) sequence from the root node, if set, as well as
the (aa) sequences. Returns a dictionary of {geneName: rootSequence}
where rootSequence is a list and geneName may be 'nuc'.
"""
ref = {}
try:
ref['nuc'] = list(all_muts[root_name]["sequence"])
except:
print("WARNING in augur.clades: nucleotide mutation json does not contain full sequences for the root node.")

if "aa_muts" in all_muts[root_name]:
# the presence of a single mutation will imply that the corresponding reference
# sequence should be present, and we will warn if it is not
nt_present = False
genes_present = set([])
missing = []
for d in all_muts.values():
if "muts" in d:
nt_present = True
genes_present.update(d.get('aa_muts', {}).keys())

if nt_present:
try:
ref.update({gene:list(seq) for gene, seq in all_muts[root_name]["aa_sequences"].items()})
except:
print("WARNING in augur.clades: amino acid mutation json does not contain full sequences for the root node.")
ref['nuc'] = list(all_muts.get(root_name, {})["sequence"])
except KeyError:
missing.append("nuc")

for gene in genes_present:
try:
ref[gene] = list(all_muts.get(root_name, {}).get("aa_sequences", {})[gene])
except KeyError:
missing.append(gene)

if missing:
print(f"WARNING in augur.clades: sequences at the root node have not been specified for {{{', '.join(missing)}}}, \
even though mutations were observed. Clades which are annotated using bases/codons present at the root \
of the tree may not be correctly inferred.")

return ref

def parse_nodes(tree_file, node_data_files):
tree = Phylo.read(tree_file, 'newick')
# don't supply tree to read_node_data as we don't want to require that every node is present in the node_data JSONs
node_data = read_node_data(node_data_files)
# node_data files can be parsed without 'nodes' (if they have 'branches')
if "nodes" not in node_data or len(node_data['nodes'].keys())==0:
raise AugurError(f"No nodes found in the supplied node data files. Please check {', '.join(node_data_files)}")
json_nodes = set(node_data["nodes"].keys())
tree_nodes = set([clade.name for clade in tree.find_clades()])
if not json_nodes.issubset(tree_nodes):
raise AugurError(f"The following nodes in the node_data files ({', '.join(node_data_files)}) are not found in the tree ({tree_file}): {', '.join(json_nodes - tree_nodes)}")
ensure_no_multiple_mutations(node_data['nodes'])
return (tree, node_data['nodes'])

def register_parser(parent_subparsers):
parser = parent_subparsers.add_parser("clades", help=__doc__)
parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided")
parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
parser.add_argument('--reference', nargs='+', help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ')
parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
parser.add_argument('--output-node-data', type=str, help='name of JSON file to save clade assignments to')
parser.add_argument('--tree', required=True, help="prebuilt Newick -- no tree will be built if provided")
parser.add_argument('--mutations', required=True, metavar="NODE_DATA_JSON", nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
parser.add_argument('--reference', nargs='+', help=SUPPRESS)
parser.add_argument('--clades', required=True, metavar="TSV", type=str, help='TSV file containing clade definitions by amino-acid')
parser.add_argument('--output-node-data', type=str, metavar="NODE_DATA_JSON", help='name of JSON file to save clade assignments to')
parser.add_argument('--membership-name', type=str, default="clade_membership", help='Key to store clade membership under; use "None" to not export this')
parser.add_argument('--label-name', type=str, default="clade", help='Key to store clade labels under; use "None" to not export this')
return parser


def run(args):
## read tree and data, if reading data fails, return with error code
tree = Phylo.read(args.tree, 'newick')
node_data = read_node_data(args.mutations, args.tree)
if node_data is None:
print("ERROR: could not read node data (incl sequences)")
return 1
all_muts = node_data['nodes']
(tree, all_muts) = parse_nodes(args.tree, args.mutations)

if args.reference:
# PLACE HOLDER FOR vcf WORKFLOW.
# Works without a reference for now but can be added if clade defs contain positions
# that are monomorphic across reference and sequence sample.
print(f"WARNING in augur.clades: You have provided a --reference file(s) ({args.reference}) however this is functionality is not yet supported.")
ref = None
else:
# extract reference sequences from the root node entry in the mutation json
Expand All @@ -294,6 +360,7 @@ def run(args):
clade_designations = read_in_clade_definitions(args.clades)

membership, labels = assign_clades(clade_designations, all_muts, tree, ref)
warn_if_clades_not_found(membership, clade_designations)

membership_key= args.membership_name if args.membership_name.upper() != "NONE" else None
label_key= args.label_name if args.label_name.upper() != "NONE" else None
Expand Down
50 changes: 50 additions & 0 deletions tests/functional/clades.t
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,53 @@ Test the ability to _not_ export a branch label (same logic as not exporting the
$ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/clades.json "$TMP/clades_no-labels.json" \
> --exclude-paths "root['generated_by']"
{'dictionary_item_removed': [root['branches']]}


A clade which exists at the root is not identified by inferring the root state
(i.e. we don't infer the root state to be A if we observe a subsequent A10T mutation)
This is an oversight and ideally would be fixed

$ ${AUGUR} clades \
> --tree clades/toy_tree.nwk \
> --mutations clades/toy_muts_no_ref.json \
> --clades clades/toy_clades_nuc.tsv \
> --output-node-data "$TMP/toy_clades_1.json" &>/dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_1.json "$TMP/toy_clades_1.json" \
> --exclude-paths "root['generated_by']"
{}

A clade which exists at the root is identified (and correctly propogated) if the root sequence
is explicitly set.

$ ${AUGUR} clades \
> --tree clades/toy_tree.nwk \
> --mutations clades/toy_muts_ref.json \
> --clades clades/toy_clades_nuc.tsv \
> --output-node-data "$TMP/toy_clades_2a.json" &>/dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_2.json "$TMP/toy_clades_2a.json" \
> --exclude-paths "root['generated_by']"
{}

A clade which exists at the root is identified (and correctly propogated) without a root sequence
if the (branch leading to the) root has the clade-defining mutation.

$ ${AUGUR} clades \
> --tree clades/toy_tree.nwk \
> --mutations clades/toy_muts_explicit_root_mutation.json \
> --clades clades/toy_clades_nuc.tsv \
> --output-node-data "$TMP/toy_clades_2b.json" &>/dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_2.json "$TMP/toy_clades_2b.json" \
> --exclude-paths "root['generated_by']"
{}

Multiple mutations at the same position on a single branch are a fatal error

$ ${AUGUR} clades \
> --tree clades/toy_tree.nwk \
> --mutations clades/toy_muts_multiple.json \
> --clades clades/toy_clades_nuc.tsv
ERROR: Multiple mutations at the same position on a single branch were found: Node A (nuc), Node AB (geneName)
[2]
30 changes: 30 additions & 0 deletions tests/functional/clades/toy_clades_1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"branches": {
"A": {
"labels": {
"clade": "clade_2"
}
}
},
"generated_by": {
"program": "augur",
"version": "21.1.0"
},
"nodes": {
"A": {
"clade_membership": "clade_2"
},
"AB": {
"clade_membership": "unassigned"
},
"B": {
"clade_membership": "unassigned"
},
"C": {
"clade_membership": "unassigned"
},
"ROOT": {
"clade_membership": "unassigned"
}
}
}
35 changes: 35 additions & 0 deletions tests/functional/clades/toy_clades_2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"branches": {
"A": {
"labels": {
"clade": "clade_2"
}
},
"ROOT": {
"labels": {
"clade": "clade_1"
}
}
},
"generated_by": {
"program": "augur",
"version": "21.1.0"
},
"nodes": {
"A": {
"clade_membership": "clade_2"
},
"AB": {
"clade_membership": "clade_1"
},
"B": {
"clade_membership": "clade_1"
},
"C": {
"clade_membership": "clade_1"
},
"ROOT": {
"clade_membership": "clade_1"
}
}
}
3 changes: 3 additions & 0 deletions tests/functional/clades/toy_clades_nuc.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
clade gene site alt
clade_1 nuc 10 A
clade_2 nuc 10 T
10 changes: 10 additions & 0 deletions tests/functional/clades/toy_muts_explicit_root_mutation.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"nodes": {
"ROOT": {
"muts": ["C10A"]
},
"A": {
"muts": ["A10T"]
}
}
}
15 changes: 15 additions & 0 deletions tests/functional/clades/toy_muts_multiple.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"nodes": {
"A": {
"muts": ["A10T", "T10C"]
},
"AB": {
"aa_muts": {
"geneName": ["S42L", "R42H", "Y50W"]
}
},
"B": {
"muts": ["A10T", "T11C"]
}
}
}
7 changes: 7 additions & 0 deletions tests/functional/clades/toy_muts_no_ref.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"nodes": {
"A": {
"muts": ["A10T"]
}
}
}
Loading