From e5cfc3a4a0e71d7decd8859e908a2c80f6f9a909 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Tue, 11 Apr 2023 20:56:18 +1200 Subject: [PATCH] [clades] check for multiple mutations at same pos Multiple mutations at the same position on a single branch are now a fatal error. Previous behaviour was to overwrite such mutations when parsing. Suggested by #735. --- augur/clades.py | 15 +++++++++++++++ tests/functional/clades.t | 11 ++++++++++- tests/functional/clades/toy_muts_multiple.json | 15 +++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 tests/functional/clades/toy_muts_multiple.json diff --git a/augur/clades.py b/augur/clades.py index 30c43e8b7..ab8614d09 100644 --- a/augur/clades.py +++ b/augur/clades.py @@ -170,6 +170,20 @@ def is_node_in_clade(clade_alleles, node, root_sequence): return all(conditions) +def ensure_no_multiple_mutations(all_muts): + multiples = [] + + for name,node in all_muts.items(): + nt_positions = [int(mut[1:-1])-1 for mut in node.get('muts', [])] + if len(set(nt_positions))!=len(nt_positions): + multiples.append(f"Node {name} (nuc)") + for gene in node.get('aa_muts', {}): + aa_positions = [int(mut[1:-1])-1 for mut in node['aa_muts'][gene]] + if len(set(aa_positions))!=len(aa_positions): + multiples.append(f"Node {name} ({gene})") + + if multiples: + raise AugurError(f"Multiple mutations at the same position on a single branch were found: {', '.join(multiples)}") def assign_clades(clade_designations, all_muts, tree, ref=None): ''' @@ -314,6 +328,7 @@ def parse_nodes(tree_file, node_data_files): tree_nodes = set([clade.name for clade in tree.find_clades()]) if not json_nodes.issubset(tree_nodes): raise AugurError(f"The following nodes in the node_data files ({', '.join(node_data_files)}) are not found in the tree ({tree_file}): {', '.join(json_nodes - tree_nodes)}") + ensure_no_multiple_mutations(node_data['nodes']) return (tree, node_data['nodes']) def register_parser(parent_subparsers): diff --git a/tests/functional/clades.t b/tests/functional/clades.t index 2b4bd759c..0dca3dc13 100644 --- a/tests/functional/clades.t +++ b/tests/functional/clades.t @@ -82,4 +82,13 @@ if the (branch leading to the) root has the clade-defining mutation. $ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/toy_clades_2.json "$TMP/toy_clades_2b.json" \ > --exclude-paths "root['generated_by']" - {} \ No newline at end of file + {} + +Multiple mutations at the same position on a single branch are a fatal error + + $ ${AUGUR} clades \ + > --tree clades/toy_tree.nwk \ + > --mutations clades/toy_muts_multiple.json \ + > --clades clades/toy_clades_nuc.tsv + ERROR: Multiple mutations at the same position on a single branch were found: Node A (nuc), Node AB (geneName) + [2] \ No newline at end of file diff --git a/tests/functional/clades/toy_muts_multiple.json b/tests/functional/clades/toy_muts_multiple.json new file mode 100644 index 000000000..0688cee89 --- /dev/null +++ b/tests/functional/clades/toy_muts_multiple.json @@ -0,0 +1,15 @@ +{ + "nodes": { + "A": { + "muts": ["A10T", "T10C"] + }, + "AB": { + "aa_muts": { + "geneName": ["S42L", "R42H", "Y50W"] + } + }, + "B": { + "muts": ["A10T", "T11C"] + } + } + } \ No newline at end of file