nextstrain · jameshadfield · May 4, 2023 · Apr 11, 2023 · Apr 11, 2023 · Apr 11, 2023
diff --git a/augur/clades.py b/augur/clades.py
@@ -18,8 +18,12 @@
 from collections import defaultdict
 import networkx as nx
 from itertools import islice
+from .errors import AugurError
+from argparse import SUPPRESS
 from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name
 
+UNASSIGNED = 'unassigned'
+
 def read_in_clade_definitions(clade_file):
     '''
     Reads in tab-seperated file that defines clades by amino acid or nucleotide mutations
@@ -120,10 +124,13 @@ def read_in_clade_definitions(clade_file):
         if clade != root
     }
 
+    if not len(clades.keys()):
+        raise AugurError(f"No clades were defined in {clade_file}")
+
     return clades
 
 
-def is_node_in_clade(clade_alleles, node, ref):
+def is_node_in_clade(clade_alleles, node, root_sequence):
     '''
     Determines whether a node matches the clade definition based on sequence
     For any condition, will first look in mutations stored in node.sequences,
@@ -132,11 +139,13 @@ def is_node_in_clade(clade_alleles, node, ref):
     Parameters
     ----------
     clade_alleles : list
-        list of clade defining alleles
+        list of clade defining alleles (typically supplied from the input TSV)
     node : Bio.Phylo.BaseTree.Clade
         node to check, assuming sequences (as mutations) are attached to node
-    ref : str or list
-        positions
+        node.sequences specifies nucleotides/codons which are newly observed on this node
+        i.e. they are the result of a mutation observed on the branch leading to this node
+    root_sequence : dict
+        {geneName: observed root sequence (list)}
 
     Returns
     -------
@@ -148,15 +157,33 @@ def is_node_in_clade(clade_alleles, node, ref):
     for gene, pos, clade_state in clade_alleles:
         if gene in node.sequences and pos in node.sequences[gene]:
             state = node.sequences[gene][pos]
-        elif ref and gene in ref:
-            state = ref[gene][pos]
+        elif root_sequence and gene in root_sequence:
+            try:
+                state = root_sequence[gene][pos]
+            except IndexError:
+                raise AugurError(f"A clade definition specifies {{{gene},{pos+1},{clade_state}}} which \
+is beyond the bounds of the supplied root sequence for {gene} (length {len(root_sequence[gene])})")
         else:
             state = ''
 
         conditions.append(state==clade_state)
 
     return all(conditions)
 
+def ensure_no_multiple_mutations(all_muts):
+    multiples = []
+
+    for name,node in all_muts.items():
+        nt_positions = [int(mut[1:-1])-1  for mut in node.get('muts', [])]
+        if len(set(nt_positions))!=len(nt_positions):
+            multiples.append(f"Node {name} (nuc)")
+        for gene in node.get('aa_muts', {}):
+            aa_positions = [int(mut[1:-1])-1 for mut in node['aa_muts'][gene]]
+            if len(set(aa_positions))!=len(aa_positions):
+                multiples.append(f"Node {name} ({gene})")
+
+    if multiples:
+        raise AugurError(f"Multiple mutations at the same position on a single branch were found: {', '.join(multiples)}")
 
 def assign_clades(clade_designations, all_muts, tree, ref=None):
     '''
@@ -189,7 +216,7 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
 
     # first pass to set all nodes to unassigned as precaution to ensure attribute is set
     for node in tree.find_clades(order = 'preorder'):
-        clade_membership[node.name] = 'unassigned'
+        clade_membership[node.name] = UNASSIGNED
 
     # count leaves
     for node in tree.find_clades(order = 'postorder'):
@@ -200,16 +227,16 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
             c.up=node
     tree.root.up = None
     tree.root.sequences = {'nuc':{}}
-    tree.root.sequences.update({gene:{} for gene in all_muts[tree.root.name]['aa_muts']})
+    tree.root.sequences.update({gene:{} for gene in all_muts.get(tree.root.name, {}).get('aa_muts', {})})
 
     # attach sequences to all nodes
     for node in tree.find_clades(order='preorder'):
         if node.up:
             node.sequences = {gene:muts.copy() for gene, muts in node.up.sequences.items()}
-        for mut in all_muts[node.name]['muts']:
+        for mut in all_muts.get(node.name, {}).get('muts', []):
             a, pos, d = mut[0], int(mut[1:-1])-1, mut[-1]
             node.sequences['nuc'][pos] = d
-        if 'aa_muts' in all_muts[node.name]:
+        if 'aa_muts' in all_muts.get(node.name, {}):
             for gene in all_muts[node.name]['aa_muts']:
                 for mut in all_muts[node.name]['aa_muts'][gene]:
                     a, pos, d = mut[0], int(mut[1:-1])-1, mut[-1]
@@ -242,49 +269,88 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
 
     return (clade_membership, clade_labels)
 
+def warn_if_clades_not_found(membership, clade_designations):
+    clades = set(clade_designations.keys())
+    found = set([clade for clade in membership.values() if clade!=UNASSIGNED])
+    if not(len(found)):
+        print(f"WARNING in augur.clades: no clades found in tree!")
+        return
+    for clade in clades-found:
+        # warn loudly - one line per unfound clade
+        print(f"WARNING in augur.clades: clade '{clade}' not found in tree!")
+
 
 def get_reference_sequence_from_root_node(all_muts, root_name):
-    # attach sequences to root
+    """
+    Extracts the (nuc) sequence from the root node, if set, as well as
+    the (aa) sequences. Returns a dictionary of {geneName: rootSequence}
+    where rootSequence is a list and geneName may be 'nuc'.
+    """
     ref = {}
-    try:
-        ref['nuc'] = list(all_muts[root_name]["sequence"])
-    except:
-        print("WARNING in augur.clades: nucleotide mutation json does not contain full sequences for the root node.")
 
-    if "aa_muts" in all_muts[root_name]:
+    # the presence of a single mutation will imply that the corresponding reference
+    # sequence should be present, and we will warn if it is not
+    nt_present = False
+    genes_present = set([])
+    missing = []
+    for d in all_muts.values():
+        if "muts" in d:
+            nt_present = True
+        genes_present.update(d.get('aa_muts', {}).keys())
+
+    if nt_present:
         try:
-            ref.update({gene:list(seq) for gene, seq in all_muts[root_name]["aa_sequences"].items()})
-        except:
-            print("WARNING in augur.clades: amino acid mutation json does not contain full sequences for the root node.")
+            ref['nuc'] = list(all_muts.get(root_name, {})["sequence"])
+        except KeyError:
+            missing.append("nuc")
+
+    for gene in genes_present:
+        try:
+            ref[gene] = list(all_muts.get(root_name, {}).get("aa_sequences", {})[gene])
+        except KeyError:
+            missing.append(gene)
+
+    if missing:            
+        print(f"WARNING in augur.clades: sequences at the root node have not been specified for {{{', '.join(missing)}}}, \
+even though mutations were observed. Clades which are annotated using bases/codons present at the root \
+of the tree may not be correctly inferred.")
 
     return ref
 
+def parse_nodes(tree_file, node_data_files):
+    tree = Phylo.read(tree_file, 'newick')
+    # don't supply tree to read_node_data as we don't want to require that every node is present in the node_data JSONs
+    node_data = read_node_data(node_data_files)
+    # node_data files can be parsed without 'nodes' (if they have 'branches')
+    if "nodes" not in node_data or len(node_data['nodes'].keys())==0:
+        raise AugurError(f"No nodes found in the supplied node data files. Please check {', '.join(node_data_files)}")
+    json_nodes = set(node_data["nodes"].keys())
+    tree_nodes = set([clade.name for clade in tree.find_clades()])
+    if not json_nodes.issubset(tree_nodes):
+        raise AugurError(f"The following nodes in the node_data files ({', '.join(node_data_files)}) are not found in the tree ({tree_file}): {', '.join(json_nodes - tree_nodes)}")
+    ensure_no_multiple_mutations(node_data['nodes'])
+    return (tree, node_data['nodes'])
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("clades", help=__doc__)
-    parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided")
-    parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
-    parser.add_argument('--reference', nargs='+', help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ')
-    parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
-    parser.add_argument('--output-node-data', type=str, help='name of JSON file to save clade assignments to')
+    parser.add_argument('--tree', required=True, help="prebuilt Newick -- no tree will be built if provided")
+    parser.add_argument('--mutations', required=True, metavar="NODE_DATA_JSON", nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
+    parser.add_argument('--reference', nargs='+', help=SUPPRESS)
+    parser.add_argument('--clades', required=True, metavar="TSV", type=str, help='TSV file containing clade definitions by amino-acid')
+    parser.add_argument('--output-node-data', type=str,  metavar="NODE_DATA_JSON", help='name of JSON file to save clade assignments to')
     parser.add_argument('--membership-name', type=str, default="clade_membership", help='Key to store clade membership under; use "None" to not export this')
     parser.add_argument('--label-name', type=str, default="clade", help='Key to store clade labels under; use "None" to not export this')
     return parser
 
 
 def run(args):
-    ## read tree and data, if reading data fails, return with error code
-    tree = Phylo.read(args.tree, 'newick')
-    node_data = read_node_data(args.mutations, args.tree)
-    if node_data is None:
-        print("ERROR: could not read node data (incl sequences)")
-        return 1
-    all_muts = node_data['nodes']
+    (tree, all_muts) = parse_nodes(args.tree, args.mutations)
 
     if args.reference:
         # PLACE HOLDER FOR vcf WORKFLOW.
         # Works without a reference for now but can be added if clade defs contain positions
         # that are monomorphic across reference and sequence sample.
+        print(f"WARNING in augur.clades: You have provided a --reference file(s) ({args.reference}) however this is functionality is not yet supported.")
         ref = None
     else:
         # extract reference sequences from the root node entry in the mutation json
@@ -294,6 +360,7 @@ def run(args):
     clade_designations = read_in_clade_definitions(args.clades)
 
     membership, labels = assign_clades(clade_designations, all_muts, tree, ref)
+    warn_if_clades_not_found(membership, clade_designations)
 
     membership_key= args.membership_name if args.membership_name.upper() != "NONE" else None
     label_key= args.label_name if args.label_name.upper() != "NONE" else None

diff --git a/tests/functional/clades.t b/tests/functional/clades.t
@@ -42,3 +42,53 @@ Test the ability to _not_ export a branch label (same logic as not exporting the
   $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/clades.json "$TMP/clades_no-labels.json" \
   >   --exclude-paths "root['generated_by']"
   {'dictionary_item_removed': [root['branches']]}
+
+
+A clade which exists at the root is not identified by inferring the root state
+(i.e. we don't infer the root state to be A if we observe a subsequent A10T mutation)
+This is an oversight and ideally would be fixed
+
+  $ ${AUGUR} clades \
+  >   --tree clades/toy_tree.nwk \
+  >   --mutations clades/toy_muts_no_ref.json \
+  >   --clades clades/toy_clades_nuc.tsv \
+  >   --output-node-data "$TMP/toy_clades_1.json" &>/dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_1.json "$TMP/toy_clades_1.json" \
+  >   --exclude-paths "root['generated_by']"
+  {}
+
+A clade which exists at the root is identified (and correctly propogated) if the root sequence
+is explicitly set.
+
+  $ ${AUGUR} clades \
+  >   --tree clades/toy_tree.nwk \
+  >   --mutations clades/toy_muts_ref.json \
+  >   --clades clades/toy_clades_nuc.tsv \
+  >   --output-node-data "$TMP/toy_clades_2a.json" &>/dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_2.json "$TMP/toy_clades_2a.json" \
+  >   --exclude-paths "root['generated_by']"
+  {}
+
+A clade which exists at the root is identified (and correctly propogated) without a root sequence
+if the (branch leading to the) root has the clade-defining mutation.
+
+  $ ${AUGUR} clades \
+  >   --tree clades/toy_tree.nwk \
+  >   --mutations clades/toy_muts_explicit_root_mutation.json \
+  >   --clades clades/toy_clades_nuc.tsv \
+  >   --output-node-data "$TMP/toy_clades_2b.json" &>/dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_2.json "$TMP/toy_clades_2b.json" \
+  >   --exclude-paths "root['generated_by']"
+  {}
+
+Multiple mutations at the same position on a single branch are a fatal error
+
+  $ ${AUGUR} clades \
+  >   --tree clades/toy_tree.nwk \
+  >   --mutations clades/toy_muts_multiple.json \
+  >   --clades clades/toy_clades_nuc.tsv
+  ERROR: Multiple mutations at the same position on a single branch were found: Node A (nuc), Node AB (geneName)
+  [2]
diff --git a/tests/functional/clades/toy_clades_1.json b/tests/functional/clades/toy_clades_1.json
@@ -0,0 +1,30 @@
+{
+  "branches": {
+    "A": {
+      "labels": {
+        "clade": "clade_2"
+      }
+    }
+  },
+  "generated_by": {
+    "program": "augur",
+    "version": "21.1.0"
+  },
+  "nodes": {
+    "A": {
+      "clade_membership": "clade_2"
+    },
+    "AB": {
+      "clade_membership": "unassigned"
+    },
+    "B": {
+      "clade_membership": "unassigned"
+    },
+    "C": {
+      "clade_membership": "unassigned"
+    },
+    "ROOT": {
+      "clade_membership": "unassigned"
+    }
+  }
+}
diff --git a/tests/functional/clades/toy_clades_2.json b/tests/functional/clades/toy_clades_2.json
@@ -0,0 +1,35 @@
+{
+  "branches": {
+    "A": {
+      "labels": {
+        "clade": "clade_2"
+      }
+    },
+    "ROOT": {
+      "labels": {
+        "clade": "clade_1"
+      }
+    }
+  },
+  "generated_by": {
+    "program": "augur",
+    "version": "21.1.0"
+  },
+  "nodes": {
+    "A": {
+      "clade_membership": "clade_2"
+    },
+    "AB": {
+      "clade_membership": "clade_1"
+    },
+    "B": {
+      "clade_membership": "clade_1"
+    },
+    "C": {
+      "clade_membership": "clade_1"
+    },
+    "ROOT": {
+      "clade_membership": "clade_1"
+    }
+  }
+}
diff --git a/tests/functional/clades/toy_clades_nuc.tsv b/tests/functional/clades/toy_clades_nuc.tsv
@@ -0,0 +1,3 @@
+clade	gene	site	alt
+clade_1	nuc	10	A
+clade_2	nuc	10	T
diff --git a/tests/functional/clades/toy_muts_explicit_root_mutation.json b/tests/functional/clades/toy_muts_explicit_root_mutation.json
@@ -0,0 +1,10 @@
+{
+  "nodes": {
+    "ROOT": {
+      "muts": ["C10A"]
+    },
+    "A": {
+      "muts": ["A10T"]
+    }
+  }
+}
diff --git a/tests/functional/clades/toy_muts_multiple.json b/tests/functional/clades/toy_muts_multiple.json
@@ -0,0 +1,15 @@
+{
+    "nodes": {
+      "A": {
+        "muts": ["A10T", "T10C"]
+      },
+      "AB": {
+        "aa_muts": {
+          "geneName": ["S42L", "R42H", "Y50W"]
+        }
+      },
+      "B": {
+        "muts": ["A10T", "T11C"]
+      }
+    }
+  }
diff --git a/tests/functional/clades/toy_muts_no_ref.json b/tests/functional/clades/toy_muts_no_ref.json
@@ -0,0 +1,7 @@
+{
+  "nodes": {
+    "A": {
+      "muts": ["A10T"]
+    }
+  }
+}