Skip to content

Commit

Permalink
Close #402 by adding coverage and mutation prevalence.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Aug 23, 2017
1 parent 8cb8b34 commit 485c663
Show file tree
Hide file tree
Showing 5 changed files with 391 additions and 345 deletions.
18 changes: 13 additions & 5 deletions micall/core/aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,8 @@ def _create_amino_writer(amino_file):
'query.nuc.pos',
'refseq.aa.pos']
columns.extend(AMINO_ALPHABET)
columns.extend(('X', 'partial', 'del', 'ins', 'clip', 'g2p_overlap'))
columns.extend(
('X', 'partial', 'del', 'ins', 'clip', 'g2p_overlap', 'coverage'))
return csv.DictWriter(amino_file,
columns,
lineterminator=os.linesep)
Expand Down Expand Up @@ -446,11 +447,13 @@ def write_amino_counts(self, amino_writer=None, coverage_summary=None):
'del': seed_amino.deletions,
'ins': report_amino.insertion_count,
'clip': report_amino.max_clip_count,
'g2p_overlap': seed_amino.g2p_overlap}
'g2p_overlap': seed_amino.g2p_overlap,
'coverage': seed_amino.deletions}
for letter in AMINO_ALPHABET:
letter_count = seed_amino.counts[letter]
row[letter] = letter_count
coverage_sum += letter_count
row['coverage'] += letter_count
amino_writer.writerow(row)
pos_count += 1
if coverage_summary is not None and pos_count > 0:
Expand All @@ -477,7 +480,8 @@ def _create_nuc_writer(nuc_file):
'del',
'ins',
'clip',
'g2p_overlap'],
'g2p_overlap',
'coverage'],
lineterminator=os.linesep)

def write_nuc_header(self, nuc_file):
Expand Down Expand Up @@ -521,9 +525,13 @@ def write_counts(self, region, seed_amino, report_amino, nuc_writer):
'del': seed_nuc.counts['-'],
'ins': insertion_count,
'clip': clip_count,
'g2p_overlap': seed_nuc.g2p_overlap}
'g2p_overlap': seed_nuc.g2p_overlap,
'coverage': seed_nuc.counts['-']}
for base in 'ACTGN':
row[base] = seed_nuc.counts[base]
nuc_count = seed_nuc.counts[base]
row[base] = nuc_count
if base != 'N':
row['coverage'] += nuc_count
nuc_writer.writerow(row)
if report_amino is not None:
report_amino.max_clip_count = max_clip_count
Expand Down
10 changes: 7 additions & 3 deletions micall/hivdb/genreport.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

from argparse import ArgumentParser, FileType
import csv
from collections import defaultdict

import yaml

try:
Expand Down Expand Up @@ -142,7 +144,7 @@ def read_mutations(cfg_dct, csv_file):
Returns a list of dictionaries.
"""
err_string = "Error in mutations file '{}'".format(csv_file.name)
exp_set = frozenset("drug_class,mutation".split(","))
exp_set = frozenset("drug_class,mutation,prevalence".split(","))
try:
data_lst = list(csv.DictReader(csv_file, restkey="dummy"))
except csv.Error:
Expand All @@ -154,7 +156,7 @@ def read_mutations(cfg_dct, csv_file):
# simple sanity check of the fields
# generate a dict of drug_class -> string of mutations
# also set the strings to 'NONE' if appropriate
tmp_dct = {}
tmp_dct = defaultdict(list)
known_drug_class_set = cfg_dct['known_drug_classes']
glob_err = False
for od_num, od in enumerate(data_lst):
Expand All @@ -167,7 +169,9 @@ def read_mutations(cfg_dct, csv_file):
print("{}: dataset {}: error with {}\n\n".format(
err_string, od_num + 1, od))
glob_err = True
tmp_dct.setdefault(d_class, []).append(mut_str)
tmp_dct[d_class].append('{}({:.0f}%)'.format(
mut_str,
100*float(od['prevalence'])))
if glob_err:
raise RuntimeError(
"{}: fatal error(s) detected: giving up...".format(err_string))
Expand Down
17 changes: 11 additions & 6 deletions micall/hivdb/hivdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,14 @@ def read_aminos(amino_csv, min_fraction, reported_regions=None):
aminos = []
for row in rows:
counts = list(map(int, (row[f] for f in coverage_columns)))
coverage = sum(counts)
coverage = int(row['coverage'])
min_count = max(1, coverage * min_fraction) # needs at least 1
pos_aminos = [report_names[i]
pos_aminos = {report_names[i]: count/coverage
for i, count in enumerate(counts)
if count >= min_count and report_names[i] != '*']
if count >= min_count and report_names[i] != '*'}
ins_count = int(row['ins'])
if ins_count >= min_count:
pos_aminos.append('i')
pos_aminos['i'] = ins_count / coverage
aminos.append(pos_aminos)
yield translated_region, aminos
for region in missing_regions:
Expand Down Expand Up @@ -109,7 +109,7 @@ def write_resistance(aminos, resistance_csv, mutations_csv):
lineterminator=os.linesep)
resistance_writer.writeheader()
mutations_writer = DictWriter(mutations_csv,
['drug_class', 'mutation'],
['drug_class', 'mutation', 'prevalence'],
lineterminator=os.linesep)
mutations_writer.writeheader()
asi = AsiAlgorithm(RULES_PATH)
Expand All @@ -128,8 +128,13 @@ def write_resistance(aminos, resistance_csv, mutations_csv):
score=drug_result.score))
for drug_class, class_mutations in result.mutations.items():
for mutation in class_mutations:
amino = mutation[-1]
pos = int(mutation[1:-1])
pos_aminos = amino_seq[pos-1]
prevalence = pos_aminos[amino]
mutations_writer.writerow(dict(drug_class=drug_class,
mutation=mutation))
mutation=mutation,
prevalence=prevalence))


def hivdb(amino_csv,
Expand Down
Loading

0 comments on commit 485c663

Please sign in to comment.