Skip to content

Commit

Permalink
New matUtils subcommand fix: finds nodes whose only mutation is a rev…
Browse files Browse the repository at this point in the history
…ersion of their grandparent's only mutation, a pattern that causes trouble for several Pango lineages, and moves those nodes to become children of their great-grandparents, having only the parent's mutation.
  • Loading branch information
AngieHinrichs committed Nov 1, 2023
1 parent 817202e commit d532e07
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 2 deletions.
123 changes: 123 additions & 0 deletions src/matUtils/fix.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#include "fix.hpp"

// Default: move node if it has at least 1 descendent
int default_min_descendent_count = 1;
int default_iterations = 1;

po::variables_map parse_fix_command(po::parsed_options parsed) {

po::variables_map vm;
po::options_description filt_desc("fix options");
filt_desc.add_options()
("input-mat,i", po::value<std::string>()->required(),
"Input mutation-annotated tree file [REQUIRED]")
("output-mat,o", po::value<std::string>()->required(),
"Path to output fixed mutation-annotated tree file [REQUIRED]")
("iterations,n", po::value<int>()->default_value(default_iterations),
"Number of iterations to run")
("min-descendent-count,c", po::value<int>()->default_value(default_min_descendent_count),
"Minimum number of descendents required to move a node")
("help,h", "Print help messages");
// Collect all the unrecognized options from the first pass. This will include the
// (positional) command name, so we need to erase that.
std::vector<std::string> opts = po::collect_unrecognized(parsed.options, po::include_positional);
opts.erase(opts.begin());

// Run the parser, with try/catch for help
try {
po::store(po::command_line_parser(opts)
.options(filt_desc)
.run(), vm);
po::notify(vm);
} catch(std::exception &e) {
std::cerr << filt_desc << std::endl;
// Return with error code 1 unless the user specifies help
if (vm.count("help"))
exit(0);
else
exit(1);
}
return vm;
}

static int fix_grandparent_reversions_r(MAT::Tree *T, MAT::Node *node, MAT::Node *ggp_node,
MAT::Node *gp_node, MAT::Node *p_node,
int min_descendent_count) {
// Recursively scan the tree for cases of grandparent-reversion, i.e. N > A > B > revA;
// when a case like that is found, move the revA node to be a child of N having mutation B.
int descendent_count = 0;
// First recursively descend to children, looking each one up by identifier in case it has
// been removed by the time we get to it:
std::vector<std::string> child_ids;
for (auto child: node->children) {
child_ids.push_back(child->identifier);
}
for (auto child_id: child_ids) {
MAT::Node *child = T->get_node(child_id);
if (child != NULL) {
descendent_count += fix_grandparent_reversions_r(T, child, gp_node, p_node, node,
min_descendent_count);
}
}
// Now determine whether this node has only one mutation that reverts its grandparent's only
// mutation; if so (and if parent has only one mut, othw parsimony score would increase),
// then move this node to be a child of its great-grandparent, with only its parent's mut.
if (ggp_node != NULL && node->mutations.size() == 1 && gp_node->mutations.size() == 1 &&
p_node->mutations.size() == 1) {
MAT::Mutation node_mut = node->mutations.front();
MAT::Mutation gp_mut = gp_node->mutations.front();
if (node_mut.position == gp_mut.position &&
node_mut.chrom == gp_mut.chrom &&
node_mut.mut_nuc == gp_mut.par_nuc &&
node_mut.par_nuc == gp_mut.mut_nuc &&
descendent_count >= min_descendent_count) {
fprintf(stderr, "Node %s mutation %s reverts grandparent %s's %s%s, moving to %s with %s (%d descendents)\n",
node->identifier.c_str(), node_mut.get_string().c_str(),
gp_node->identifier.c_str(),
(gp_mut.mut_nuc == gp_mut.ref_nuc ? "reversion " : ""),
gp_mut.get_string().c_str(),
ggp_node->identifier.c_str(),
p_node->mutations.front().get_string().c_str(), descendent_count);
node->mutations.clear();
node->mutations = std::vector<MAT::Mutation>(p_node->mutations);
T->move_node(node->identifier, ggp_node->identifier);
}
}
return descendent_count + 1;
}

static void fix_grandparent_reversions(MAT::Tree *T, int iterations, int min_descendent_count) {
// Find and fix nodes that reverse their grandparent's mutation.
// For each case of N > A > B > revA, move revA to N > B (if N > B already exists then move all
// children of revA to N > B).
int i;
for (i = 0; i < iterations; i++) {
fix_grandparent_reversions_r(T, T->root, NULL, NULL, NULL, min_descendent_count);
}
}

void fix_main(po::parsed_options parsed) {
po::variables_map vm = parse_fix_command(parsed);
std::string input_mat_filename = vm["input-mat"].as<std::string>();
std::string output_mat_filename = vm["output-mat"].as<std::string>();
int iterations = vm["iterations"].as<int>();
int min_descendent_count = vm["min-descendent-count"].as<int>();

// Load the input MAT
timer.Start();
fprintf(stderr, "Loading input MAT file %s.\n", input_mat_filename.c_str());
MAT::Tree T = MAT::load_mutation_annotated_tree(input_mat_filename);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());

// No need to uncondense the tree

timer.Start();
fprintf(stderr, "Fixing grandparent-reversions\n");
fix_grandparent_reversions(&T, iterations, min_descendent_count);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());

timer.Start();
fprintf(stderr, "Saving Final Tree\n");
MAT::save_mutation_annotated_tree(T, output_mat_filename);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}
3 changes: 3 additions & 0 deletions src/matUtils/fix.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#include "common.hpp"

void fix_main(po::parsed_options parsed);
8 changes: 6 additions & 2 deletions src/matUtils/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "extract.hpp"
#include "merge.hpp"
#include "introduce.hpp"
#include "fix.hpp"
#include "version.hpp"
#include <cstddef>

Expand All @@ -12,15 +13,15 @@ Timer timer;
int main (int argc, char** argv) {
po::options_description global("Command options");
global.add_options()
("command", po::value<std::string>(), "Command to execute. Valid options are annotate, mask, extract, uncertainty, and summary.")
("command", po::value<std::string>(), "Command to execute. Valid options are annotate, mask, extract, uncertainty, introduce, fix, merge, version, and summary.")
("subargs", po::value<std::vector<std::string> >(), "Command-specific arguments.");
po::positional_options_description pos;
pos.add("command",1 ).add("subargs", -1);
std::string cmd;
po::variables_map vm;
po::parsed_options parsed = po::command_line_parser(argc, argv).options(global).positional(pos).allow_unregistered().run();
//this help string shows up over and over, lets just define it once
std::string cnames[] = {"COMMAND","summary","extract","annotate","uncertainty","introduce", "merge", "mask", "version"};
std::string cnames[] = {"COMMAND","summary","extract","annotate","uncertainty","introduce", "merge", "mask", "fix", "version"};
std::string chelp[] = {
"DESCRIPTION\n\n",
"calculates basic statistics and counts samples, mutations, and clades in the input MAT\n\n",
Expand All @@ -30,6 +31,7 @@ int main (int argc, char** argv) {
"given sample region information, heuristically identifies points of geographic introduction along the phylogeny\n\n",
"merge all samples of two input MAT files into a single output MAT \n\n",
"masks the input samples\n\n",
"fixes grandparent-reversion structures\n\n",
"display version number\n\n"
};
try {
Expand Down Expand Up @@ -57,6 +59,8 @@ int main (int argc, char** argv) {
introduce_main(parsed);
} else if (cmd == "mask") {
mask_main(parsed);
} else if (cmd == "fix") {
fix_main(parsed);
} else if (cmd == "version") {
std::cerr << "matUtils (v" << PROJECT_VERSION << ")" << std::endl;
} else if (cmd == "help") {
Expand Down

0 comments on commit d532e07

Please sign in to comment.