Skip to content

Commit

Permalink
Merge pull request #357 from AngieHinrichs/collapse_and_fix
Browse files Browse the repository at this point in the history
matUtils updates: new subcommand "fix"; improvements to MAT::Tree:move_node and mask --move-nodes
  • Loading branch information
yatisht authored Nov 2, 2023
2 parents 002eb3a + d532e07 commit b96680c
Show file tree
Hide file tree
Showing 6 changed files with 411 additions and 85 deletions.
123 changes: 123 additions & 0 deletions src/matUtils/fix.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#include "fix.hpp"

// Default: move node if it has at least 1 descendent
int default_min_descendent_count = 1;
int default_iterations = 1;

po::variables_map parse_fix_command(po::parsed_options parsed) {

po::variables_map vm;
po::options_description filt_desc("fix options");
filt_desc.add_options()
("input-mat,i", po::value<std::string>()->required(),
"Input mutation-annotated tree file [REQUIRED]")
("output-mat,o", po::value<std::string>()->required(),
"Path to output fixed mutation-annotated tree file [REQUIRED]")
("iterations,n", po::value<int>()->default_value(default_iterations),
"Number of iterations to run")
("min-descendent-count,c", po::value<int>()->default_value(default_min_descendent_count),
"Minimum number of descendents required to move a node")
("help,h", "Print help messages");
// Collect all the unrecognized options from the first pass. This will include the
// (positional) command name, so we need to erase that.
std::vector<std::string> opts = po::collect_unrecognized(parsed.options, po::include_positional);
opts.erase(opts.begin());

// Run the parser, with try/catch for help
try {
po::store(po::command_line_parser(opts)
.options(filt_desc)
.run(), vm);
po::notify(vm);
} catch(std::exception &e) {
std::cerr << filt_desc << std::endl;
// Return with error code 1 unless the user specifies help
if (vm.count("help"))
exit(0);
else
exit(1);
}
return vm;
}

static int fix_grandparent_reversions_r(MAT::Tree *T, MAT::Node *node, MAT::Node *ggp_node,
MAT::Node *gp_node, MAT::Node *p_node,
int min_descendent_count) {
// Recursively scan the tree for cases of grandparent-reversion, i.e. N > A > B > revA;
// when a case like that is found, move the revA node to be a child of N having mutation B.
int descendent_count = 0;
// First recursively descend to children, looking each one up by identifier in case it has
// been removed by the time we get to it:
std::vector<std::string> child_ids;
for (auto child: node->children) {
child_ids.push_back(child->identifier);
}
for (auto child_id: child_ids) {
MAT::Node *child = T->get_node(child_id);
if (child != NULL) {
descendent_count += fix_grandparent_reversions_r(T, child, gp_node, p_node, node,
min_descendent_count);
}
}
// Now determine whether this node has only one mutation that reverts its grandparent's only
// mutation; if so (and if parent has only one mut, othw parsimony score would increase),
// then move this node to be a child of its great-grandparent, with only its parent's mut.
if (ggp_node != NULL && node->mutations.size() == 1 && gp_node->mutations.size() == 1 &&
p_node->mutations.size() == 1) {
MAT::Mutation node_mut = node->mutations.front();
MAT::Mutation gp_mut = gp_node->mutations.front();
if (node_mut.position == gp_mut.position &&
node_mut.chrom == gp_mut.chrom &&
node_mut.mut_nuc == gp_mut.par_nuc &&
node_mut.par_nuc == gp_mut.mut_nuc &&
descendent_count >= min_descendent_count) {
fprintf(stderr, "Node %s mutation %s reverts grandparent %s's %s%s, moving to %s with %s (%d descendents)\n",
node->identifier.c_str(), node_mut.get_string().c_str(),
gp_node->identifier.c_str(),
(gp_mut.mut_nuc == gp_mut.ref_nuc ? "reversion " : ""),
gp_mut.get_string().c_str(),
ggp_node->identifier.c_str(),
p_node->mutations.front().get_string().c_str(), descendent_count);
node->mutations.clear();
node->mutations = std::vector<MAT::Mutation>(p_node->mutations);
T->move_node(node->identifier, ggp_node->identifier);
}
}
return descendent_count + 1;
}

static void fix_grandparent_reversions(MAT::Tree *T, int iterations, int min_descendent_count) {
// Find and fix nodes that reverse their grandparent's mutation.
// For each case of N > A > B > revA, move revA to N > B (if N > B already exists then move all
// children of revA to N > B).
int i;
for (i = 0; i < iterations; i++) {
fix_grandparent_reversions_r(T, T->root, NULL, NULL, NULL, min_descendent_count);
}
}

void fix_main(po::parsed_options parsed) {
po::variables_map vm = parse_fix_command(parsed);
std::string input_mat_filename = vm["input-mat"].as<std::string>();
std::string output_mat_filename = vm["output-mat"].as<std::string>();
int iterations = vm["iterations"].as<int>();
int min_descendent_count = vm["min-descendent-count"].as<int>();

// Load the input MAT
timer.Start();
fprintf(stderr, "Loading input MAT file %s.\n", input_mat_filename.c_str());
MAT::Tree T = MAT::load_mutation_annotated_tree(input_mat_filename);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());

// No need to uncondense the tree

timer.Start();
fprintf(stderr, "Fixing grandparent-reversions\n");
fix_grandparent_reversions(&T, iterations, min_descendent_count);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());

timer.Start();
fprintf(stderr, "Saving Final Tree\n");
MAT::save_mutation_annotated_tree(T, output_mat_filename);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}
3 changes: 3 additions & 0 deletions src/matUtils/fix.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#include "common.hpp"

void fix_main(po::parsed_options parsed);
8 changes: 6 additions & 2 deletions src/matUtils/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "extract.hpp"
#include "merge.hpp"
#include "introduce.hpp"
#include "fix.hpp"
#include "version.hpp"
#include <cstddef>

Expand All @@ -12,15 +13,15 @@ Timer timer;
int main (int argc, char** argv) {
po::options_description global("Command options");
global.add_options()
("command", po::value<std::string>(), "Command to execute. Valid options are annotate, mask, extract, uncertainty, and summary.")
("command", po::value<std::string>(), "Command to execute. Valid options are annotate, mask, extract, uncertainty, introduce, fix, merge, version, and summary.")
("subargs", po::value<std::vector<std::string> >(), "Command-specific arguments.");
po::positional_options_description pos;
pos.add("command",1 ).add("subargs", -1);
std::string cmd;
po::variables_map vm;
po::parsed_options parsed = po::command_line_parser(argc, argv).options(global).positional(pos).allow_unregistered().run();
//this help string shows up over and over, lets just define it once
std::string cnames[] = {"COMMAND","summary","extract","annotate","uncertainty","introduce", "merge", "mask", "version"};
std::string cnames[] = {"COMMAND","summary","extract","annotate","uncertainty","introduce", "merge", "mask", "fix", "version"};
std::string chelp[] = {
"DESCRIPTION\n\n",
"calculates basic statistics and counts samples, mutations, and clades in the input MAT\n\n",
Expand All @@ -30,6 +31,7 @@ int main (int argc, char** argv) {
"given sample region information, heuristically identifies points of geographic introduction along the phylogeny\n\n",
"merge all samples of two input MAT files into a single output MAT \n\n",
"masks the input samples\n\n",
"fixes grandparent-reversion structures\n\n",
"display version number\n\n"
};
try {
Expand Down Expand Up @@ -57,6 +59,8 @@ int main (int argc, char** argv) {
introduce_main(parsed);
} else if (cmd == "mask") {
mask_main(parsed);
} else if (cmd == "fix") {
fix_main(parsed);
} else if (cmd == "version") {
std::cerr << "matUtils (v" << PROJECT_VERSION << ")" << std::endl;
} else if (cmd == "help") {
Expand Down
155 changes: 114 additions & 41 deletions src/matUtils/mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,16 @@ void mask_main(po::parsed_options parsed) {
exit(1);
}
// Load input MAT and uncondense tree
fprintf(stderr, "Loading input MAT file %s.\n", input_mat_filename.c_str());
timer.Start();
MAT::Tree T = MAT::load_mutation_annotated_tree(input_mat_filename);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
//T here is the actual object.
if (T.condensed_nodes.size() > 0) {
fprintf(stderr, "Uncondensing condensed nodes.\n");
timer.Start();
T.uncondense_leaves();
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}

// If a restricted samples file was provided, perform masking procedure
Expand Down Expand Up @@ -103,12 +109,20 @@ void mask_main(po::parsed_options parsed) {

// Store final MAT to output file
if (output_mat_filename != "") {
fprintf(stderr, "Saving Final Tree\n");
if (recondense) {
timer.Start();
fprintf(stderr, "Collapsing tree...\n");
T.collapse_tree();
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
timer.Start();
fprintf(stderr, "Condensing leaves...\n");
T.condense_leaves();
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}
fprintf(stderr, "Saving Final Tree to %s\n", output_mat_filename.c_str());
timer.Start();
MAT::save_mutation_annotated_tree(T, output_mat_filename);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}
}

Expand Down Expand Up @@ -211,6 +225,8 @@ void restrictMutationsLocally (std::string mutations_filename, MAT::Tree* T, boo
delim = ',';
}
std::string rootid = T->root->identifier;
size_t total_masked = 0;
timer.Start();
while (std::getline(infile, line)) {
std::vector<std::string> words;
if (line[line.size()-1] == '\r') {
Expand All @@ -236,19 +252,27 @@ void restrictMutationsLocally (std::string mutations_filename, MAT::Tree* T, boo
}
// fprintf(stderr, "Masking mutation %s below node %s\n", ml.first.c_str(), ml.second.c_str());
for (auto n: T->depth_first_expansion(rn)) {
std::vector<MAT::Mutation> nmuts;
// The expected common case is to not match any mutations and have nothing to remove.
std::vector<MAT::Mutation> muts_to_remove;
for (auto& mut: n->mutations) {
if (match_mutations(mutobj, &mut)) {
instances_masked++;
} else {
nmuts.push_back(mut);
muts_to_remove.push_back(mut);
}
}
n->mutations = nmuts;
for (auto mut: muts_to_remove) {
auto iter = std::find(n->mutations.begin(), n->mutations.end(), mut);
n->mutations.erase(iter);
}
}
total_masked += instances_masked;
}
fprintf(stderr, "Completed in %ld msec \n", timer.Stop());
infile.close();
fprintf(stderr, "Masked a total of %lu mutations. Collapsing tree...\n", total_masked);
timer.Start();
T->collapse_tree();
fprintf(stderr, "Completed in %ld msec \n", timer.Stop());
}

void restrictSamples (std::string samples_filename, MAT::Tree& T) {
Expand Down Expand Up @@ -356,6 +380,43 @@ void restrictSamples (std::string samples_filename, MAT::Tree& T) {
}
}

std::unordered_set<std::string> mutation_set_from_node(MAT::Tree* T, MAT::Node* node, bool include_node, bool include_ancestors) {
std::unordered_set<std::string> mutations;
if (include_ancestors) {
for (auto an: T->rsearch(node->identifier, include_node)) {
for (auto mut: an->mutations) {
if (mut.is_masked()) {
continue;
}
MAT::Mutation mut_opposite = mut.copy();
//we check for whether this mutation is going to negate with something in the set
//by identifying its opposite and checking whether the opposite is already present on the traversal.
mut_opposite.par_nuc = mut.mut_nuc;
mut_opposite.mut_nuc = mut.par_nuc;
auto cml = mutations.find(mut_opposite.get_string());
if (cml != mutations.end()) {
mutations.erase(cml);
} else {
mutations.insert(mut.get_string());
}
}
}
} else if (include_node) {
for (auto mut: node->mutations) {
if (mut.is_masked()) {
continue;
}
mutations.insert(mut.get_string());
}
} else {
fprintf(stderr, "ERROR: mutation_set_from_node: at least one of include_node and include_ancestors should be true.\n");
exit(1);
}
return mutations;
}



void moveNodes (std::string node_filename, MAT::Tree* T) {
// Function to move nodes between two identical placement paths. That is, move the target node so that is a child of the indicated new parent node,
// but the current placement and new placement must involve exactly the same set of mutations for the move to be allowed.
Expand Down Expand Up @@ -394,49 +455,61 @@ void moveNodes (std::string node_filename, MAT::Tree* T) {
}
//accumulate the set of mutations belonging to the current and the new placement
//not counting mutations belonging to the target, but counting ones to the putative new parent.
std::unordered_set<std::string> curr_mutations;
std::unordered_set<std::string> new_mutations;
for (auto an: T->rsearch(mn->identifier, false)) {
for (auto mut: an->mutations) {
if (mut.is_masked()) {
continue;
}
MAT::Mutation mut_opposite = mut.copy();
//we check for whether this mutation is going to negate with something in the set
//by identifying its opposite and checking whether the opposite is already present on the traversal.
mut_opposite.par_nuc = mut.mut_nuc;
mut_opposite.mut_nuc = mut.par_nuc;
auto cml = curr_mutations.find(mut_opposite.get_string());
if (cml != curr_mutations.end()) {
curr_mutations.erase(cml);
std::unordered_set<std::string> curr_mutations = mutation_set_from_node(T, mn, false, true);
std::unordered_set<std::string> new_mutations = mutation_set_from_node(T, np, true, true);
if (curr_mutations == new_mutations) {
//we can now proceed with the move.
T->move_node(mn->identifier, np->identifier);
fprintf(stderr, "Move of node %s to node %s successful.\n", words[0].c_str(), words[1].c_str());
} else {
// Not quite the same; figure out whether we need to add a node under new parent.
fprintf(stderr, "The current (%s) and new (%s) node paths do not involve the same set of mutations.\n",
mn->identifier.c_str(), np->identifier.c_str());

std::unordered_set<std::string> extra_mutations;
size_t curr_in_new_count = 0;
for (auto mut: curr_mutations) {
if (new_mutations.find(mut) != new_mutations.end()) {
curr_in_new_count++;
} else {
curr_mutations.insert(mut.get_string());
extra_mutations.insert(mut);
}
}
}
for (auto an: T->rsearch(np->identifier, true)) {
for (auto mut: an->mutations) {
if (mut.is_masked()) {
continue;
if (extra_mutations.size() == 0 || curr_in_new_count != new_mutations.size()) {
fprintf(stderr, "ERROR: the new parent (%s) has mutations not found in the current node (%s); %ld in common, %ld in new\n",
np->identifier.c_str(), mn->identifier.c_str(), curr_in_new_count, new_mutations.size());
exit(1);
}
// Look for a child of np that already has extra_mutations. If there is such a child
// then move mn to that child. Otherwise add those mutations to mn and move it to np.
MAT::Node *child_with_muts = NULL;
for (auto child: np->children) {
std::unordered_set<std::string> mut_set = mutation_set_from_node(T, child, true, false);
if (mut_set == extra_mutations) {
child_with_muts = child;
fprintf(stderr, "Found child with extra_mutations: %s\n", child->identifier.c_str());
break;
}
MAT::Mutation mut_opposite = mut.copy();
mut_opposite.par_nuc = mut.mut_nuc;
mut_opposite.mut_nuc = mut.par_nuc;
auto cml = new_mutations.find(mut_opposite.get_string());
if (cml != new_mutations.end()) {
new_mutations.erase(cml);
} else {
new_mutations.insert(mut.get_string());
}
if (child_with_muts != NULL) {
T->move_node(mn->identifier, child_with_muts->identifier);
} else {
// Preserve chronological order expected by add_mutation by adding mn's mutations
// after extra_mutations instead of vice versa.
std::vector<MAT::Mutation>mn_mutations;
for (auto mut: mn->mutations) {
mn_mutations.push_back(mut);
}
mn->mutations.clear();
for (auto mut: extra_mutations) {
mn->add_mutation(*(MAT::mutation_from_string(mut)));
}
for (auto mut: mn_mutations) {
mn->add_mutation(mut);
}
T->move_node(mn->identifier, np->identifier);
}
}
if (curr_mutations != new_mutations) {
fprintf(stderr, "ERROR: The current and new placement paths do not involve the same set of mutations. Exiting\n");
exit(1);
}
//we can now proceed with the move.
T->move_node(mn->identifier, np->identifier);
fprintf(stderr, "Move of node %s to node %s successful.\n", words[0].c_str(), words[1].c_str());
}
fprintf(stderr, "All requested moves complete.\n");
}
Loading

0 comments on commit b96680c

Please sign in to comment.