diff --git a/augur/distance.py b/augur/distance.py index a24294407..a558dd573 100644 --- a/augur/distance.py +++ b/augur/distance.py @@ -245,15 +245,29 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map) >>> distance_map = {"default": 0.0, "map": {"gene": {2: {('T', 'G'): 0.5}}}} >>> get_distance_between_nodes(node_b_sequences, node_a_sequences, distance_map) 0.0 + + Ignore specific characters defined in the distance map. + + >>> node_a_sequences = {"gene": "ACTGG"} + >>> node_b_sequences = {"gene": "A--GN"} + >>> distance_map = {"default": 1, "ignored_characters":["-"], "map": {}} + >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map) + 1 + >>> distance_map = {"default": 1, "ignored_characters":["-", "N"], "map": {}} + >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map) + 0 """ distance_type = type(distance_map["default"]) distance = distance_type(0) + ignored_characters = distance_map.get("ignored_characters",[]) for gene in node_a_sequences: gene_length = len(node_a_sequences[gene]) for site in range(gene_length): - if node_a_sequences[gene][site] != node_b_sequences[gene][site]: + if (node_a_sequences[gene][site] != node_b_sequences[gene][site] + and node_a_sequences[gene][site] not in ignored_characters + and node_b_sequences[gene][site] not in ignored_characters): if gene in distance_map["map"] and site in distance_map["map"][gene]: # Distances can be provided as either site- and # sequence-specific dictionaries of sequence pairs to