Skip to content

Commit

Permalink
added date parsing and tested timetree/ancestral modes, additional ar…
Browse files Browse the repository at this point in the history
…guments still need to be sorted out
  • Loading branch information
rneher committed May 6, 2018
1 parent def6220 commit 6dac7eb
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 16 deletions.
67 changes: 51 additions & 16 deletions augur/tree.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import os, shutil, time
from Bio import Phylo
from .utils import parse_metadata, meta_to_date_dict

def build_raxml(aln_file, out_file, clean_up=True, nthreads=2):
call = ["raxml","-f d -T",str(nthreads),"-m GTRCAT -c 25 -p 235813 -n tre -s",aln_file,"> raxml.log"]
'''
build tree using RAxML with parameters '-f d -m GTRCAT -c 25 -p 235813 -n tre"
'''
call = ["raxml","-T",str(nthreads)," -f d -m GTRCAT -c 25 -p 235813 -n tre -s",aln_file,"> raxml.log"]
cmd = " ".join(call)
print("Building a tree via:\n\t" + cmd +
"\n\tStamatakis, A: RAxML Version 8: A tool for Phylogenetic Analysis and Post-Analysis of Large Phylogenies."
Expand All @@ -20,6 +24,9 @@ def build_raxml(aln_file, out_file, clean_up=True, nthreads=2):
return T

def build_fasttree(aln_file, out_file, clean_up=True):
'''
build tree using fasttree with parameters "-nt"
'''
call = ["fasttree", "-nt", aln_file, "1>", out_file, "2>", "fasttree.log"]
cmd = " ".join(call)
print("Building a tree via:\n\t" + cmd +
Expand All @@ -38,11 +45,16 @@ def build_fasttree(aln_file, out_file, clean_up=True):


def build_iqtree(aln_file, out_file, iqmodel="HKY+F", clean_up=True, nthreads=2):
#return Phylo.read(out_file.replace(".nwk",".iqtree.nwk"), 'newick') #uncomment for debug skip straight to TreeTime

'''
build tree using IQ-Tree with parameters "-fast"
arguments:
aln_file file name of input aligment
out_file file name to write tree to
'''
with open(aln_file) as ifile:
tmp_seqs = ifile.readlines()

# IQ-tree messes with taxon names. Hence remove offending characters, reinstaniate later
aln_file = "temp_iqtree.fasta"
with open(aln_file, 'w') as ofile:
for line in tmp_seqs:
Expand All @@ -59,6 +71,8 @@ def build_iqtree(aln_file, out_file, iqmodel="HKY+F", clean_up=True, nthreads=2)
"\n\tNguyen et al: IQ-TREE: A fast and effective stochastic algorithm for estimating maximum likelihood phylogenies."
"\n\tMol. Biol. Evol., 32:268-274. https://doi.org/10.1093/molbev/msu300\n")
os.system(cmd)

# Check result
try:
T = Phylo.read(aln_file+".treefile", 'newick')
shutil.copyfile(aln_file+".treefile", out_file)
Expand Down Expand Up @@ -99,7 +113,7 @@ def timetree(tree=None, aln=None, ref=None, dates=None, keeproot=False,
marginal = confidence

tt.run(infer_gtr=infer_gtr, root=reroot, Tc=Tc, time_marginal=marginal,
resolve_polytomies=resolve_polytomies, max_iter=max_iter, fixed_pi=pi, **kwarks)
resolve_polytomies=resolve_polytomies, max_iter=max_iter, fixed_pi=fixed_pi, **kwarks)

if confidence:
for n in T.find_clades():
Expand All @@ -109,23 +123,28 @@ def timetree(tree=None, aln=None, ref=None, dates=None, keeproot=False,


def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
optimize_branch_length=True):
marginal=False, optimize_branch_length=True):
from treetime import TreeAnc
tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69')

if optimize_branch_length:
tt.optimize_seq_and_branch_len(infer_gtr=infer_gtr)
tt.optimize_seq_and_branch_len(infer_gtr=infer_gtr, marginal=marginal)
else: # only infer ancestral sequences, leave branch length untouched
tt.infer_ancestral_sequences(infer_gtr=infer_gtr)
tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=marginal)

return tt

def prep_tree(T, attributes):
data = {}
for n in T.find_clades():
data[n.name] = {attr:n.__getattr__(attr) for attr in attributes if hasattr(n,attr)}
data[n.name] = {attr:n.__getattribute__(attr)
for attr in attributes if hasattr(n,attr)}
if 'mutations' in attributes:
for n in T.find_clades():
data[n.name]['mutations'] = [[a,int(pos),d] for a,pos,d in data[n.name]['mutations']]
return data


def run(args):
# check alignment type, construct reduced alignment if needed
if any([args.alignment.endswith(x) for x in ['.vcf', '.vcf.gz']]):
Expand Down Expand Up @@ -155,12 +174,12 @@ def run(args):
if args.output:
tree_fname = args.output
else:
tree_fname = '.'.join(args.alignment.split('.')) + '.nwk'
tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '.nwk'

if args.iqmodel and not args.method=='iqtree':
print("Cannot specify model unless using IQTree. Model specification ignored.")

args.tree_meta['topology method':args.method]
tree_meta['topology method'] = args.method
if args.method=='raxml':
T = build_raxml(aln, tree_fname, args.nthreads)
elif args.method=='iqtree':
Expand All @@ -171,12 +190,23 @@ def run(args):
print("Building original tree took {}".format(str(end-start)))

if args.timetree and T:
if args.metadata is None:
print("ERROR: meta data with dates is required for time tree reconstruction")
return -1
metadata = parse_metadata(args.metadata)
dates = meta_to_date_dict(metadata, fmt=args.date_fmt)

tt = timetree(tree=T, aln=aln, dates=dates, confidence=args.date_confidence)
attributes.extend(['numdate', 'clock_length', 'mutation_length'])
args.tree_meta['clock':{'rate':tt.date2dist.clock, 'intercept':tt.date2dist.intercept}]
tree_meta['clock'] = {'rate':tt.date2dist.clock_rate,
'intercept':tt.date2dist.intercept,
'rtt_Tmrca':-tt.date2dist.intercept/tt.date2dist.clock_rate}
attributes.extend(['numdate', 'clock_length', 'mutation_length', 'mutations'])
if args.date_confidence:
attributes.append('numdate_confidence')
elif args.ancestral in ['joint', 'marginal']:
tt = ancestral(tree=T, aln=aln, marginal=args.ancestral, optimize_branch_length=args.branchlengths=='div')
attributes.extend(['mutation_length'])
tt = ancestral_sequence_inference(tree=T, aln=aln, marginal=args.ancestral,
optimize_branch_length=args.branchlengths=='div')
attributes.extend(['mutation_length', 'mutations'])
else:
tt = None

Expand All @@ -185,8 +215,13 @@ def run(args):
if T:
import json
tree_success = Phylo.write(T, tree_fname, 'newick')
with open(args.output, 'w') as ofile:
meta_success = json.dump(tree_meta, ofile, indent=1)
if args.node_data:
node_data_fname = args.node_data
else:
node_data_fname = '.'.join(args.alignment.split('.')[:-1]) + '.node_data'

with open(node_data_fname, 'w') as ofile:
meta_success = json.dump(tree_meta, ofile)
return 0 if (tree_success and meta_success) else -1
else:
return -1
35 changes: 35 additions & 0 deletions augur/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pandas as pd
from treetime.utils import numeric_date
from base.utils import ambiguous_date_to_date_range

def parse_metadata(fname):
metadata = pd.read_csv(fname, sep='\t' if fname[-3:]=='tsv' else ',',
skipinitialspace=True)
return metadata

def meta_to_date_dict(metadata, name_col = None, date_col='date', fmt=None):
possible_name_cols = ['strain', 'name'] if name_col is None else [name_col]
dates = None
for nc in possible_name_cols:
try:
dates = {row.loc[nc]:row.loc[date_col] for ri,row in metadata.iterrows()}
break
except:
pass

if dates is None:
print("ERROR: Can not find meta data column with strain names")
return None

if fmt:
from datetime import datetime
numerical_dates = {}
for k,v in dates.items():
if 'XX' in v:
numerical_dates[k] = [numeric_date(d) for d in ambiguous_date_to_date_range(v, fmt)]
else:
numerical_dates[k] = numeric_date(datetime.strptime(v, fmt))
else:
numerical_dates = {k:float(v) for k,v in dates.items()}

return numerical_dates
10 changes: 10 additions & 0 deletions bin/augur
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,17 @@ if __name__=="__main__":
tree_parser = subparsers.add_parser('tree', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
tree_parser.add_argument('--alignment', required=True, help="alignment to build tree from")
tree_parser.add_argument('--method', default='iqtree', choices=["fasttree", "raxml", "iqtree"], help="tree builder to use")
tree_parser.add_argument('--tree', help="prebuild newick -- no tree will be build if provided")
tree_parser.add_argument('--metadata', help="tsv/csv table with meta data for sequences")
tree_parser.add_argument('--timetree', action="store_true", help="produce timetree using treetime")
tree_parser.add_argument('--date_fmt', default="%Y-%m-%d", help="date format")
tree_parser.add_argument('--date_confidence', action="store_true", help="calculate confidence intervals for node dates")
tree_parser.add_argument('--ancestral', default="joint", choices=["joint", "marginal", "None"],
help="calculate joint maximum likelihood ancestral sequence states")
tree_parser.add_argument('--branchlengths', default='div', choices = ['div', 'clock'],
help='branch length of the output tree')
tree_parser.add_argument('--output', help='file name to write tree to')
tree_parser.add_argument('--node_data', help='file name to write additional node data')
tree_parser.add_argument('--iqmodel', default="HKY+F", help='substitution model to use for iq-tree')
tree_parser.add_argument('--nthreads', type=int, default=2,
help="number of threads used by mafft")
Expand Down

0 comments on commit 6dac7eb

Please sign in to comment.