Sync from bench #29

Merged (30 commits, Apr 7, 2021)

Commits (30):
81683be  (baraaorabi, Feb 12, 2021)
a8f9898  minor changes  (baraaorabi, Feb 12, 2021)
1f3efa1  including LTR-sim as submod  (baraaorabi, Feb 17, 2021)
667abed  upto mapping is automated  (baraaorabi, Feb 17, 2021)
10de8a8  Fixed issue with where there are too many contigs  (baraaorabi, Feb 17, 2021)
43cffc6  Snakefile is good; need to fix freddie_isoforms.py  (baraaorabi, Feb 18, 2021)
854bfbe  isoform collapsing is done; reduced cluster noise  (baraaorabi, Feb 18, 2021)
3661688  Generating transcripts and beds  (baraaorabi, Feb 26, 2021)
04d0132  Generating all baseline fastas and beds  (baraaorabi, Mar 3, 2021)
fa9fe18  bugfix  (baraaorabi, Mar 3, 2021)
14d58f7  We have overlaps too  (baraaorabi, Mar 5, 2021)
1b3cdd1  Done with running tools  (baraaorabi, Mar 5, 2021)
a50549a  rm files; edges are ready  (baraaorabi, Mar 8, 2021)
b70ee93  Bugfix  (baraaorabi, Mar 10, 2021)
772ed12  Accuracy ploting  (baraaorabi, Mar 10, 2021)
d8b5907  accuracy integrated into snakemake  (baraaorabi, Mar 22, 2021)
8ff243e  fully specified env add  (baraaorabi, Mar 22, 2021)
6239af6  Merge branch 'benchmarking' of https://github.com/vpc-ccg/freddie int…  (baraaorabi, Mar 22, 2021)
6203ab0  removed prefix from env  (baraaorabi, Mar 22, 2021)
cc46660  Split conda envs, all working till gtime now  (baraaorabi, Mar 24, 2021)
0a4a820  Made a switch to parallel for mini tasks  (baraaorabi, Mar 26, 2021)
2b7b922  fix in fa gen; acc py gens dirs; acc sm in gpara  (baraaorabi, Mar 30, 2021)
d00ca5e  bugfix for baseline acc  (baraaorabi, Apr 7, 2021)
c95d3ba  Delete .gitmodules  (baraaorabi, Apr 7, 2021)
36abb00  Delete Snakefile-accuracy  (baraaorabi, Apr 7, 2021)
021a0aa  Delete accuracy.yml  (baraaorabi, Apr 7, 2021)
d1f6449  Delete flair.yml  (baraaorabi, Apr 7, 2021)
cacafa0  Delete run_tools.yml  (baraaorabi, Apr 7, 2021)
e057b87  restore Snakefile  (baraaorabi, Apr 7, 2021)
353cf9a  Merge branch 'master' into sync_from_bench  (baraaorabi, Apr 7, 2021)
Showing changes from commit 772ed125c86caa3d726d5b08f7d281254fdfd06c ("Accuracy ploting"), committed by baraaorabi on Mar 10, 2021.
py/freddie_accuracy.py: 261 additions, 0 deletions
@@ -0,0 +1,261 @@
#!/usr/bin/env python3
import argparse
import glob
import os

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

BIP_TRUTH = 0
BIP_TOOL = 1


def parse_args():
    default_thresholds = list(np.arange(0.85, 0.95, 0.05)) + \
        list(np.arange(0.95, 1.00, 0.02))
    default_thresholds = [round(t, 2) for t in default_thresholds]
    parser = argparse.ArgumentParser(
        description="Outputs accuracy plots for different tools against the simulated ground truth")
    parser.add_argument("-id",
                        "--input-dir",
                        type=str,
                        required=True,
                        help="Input directory for edges TSV and BED files")
    parser.add_argument("-od",
                        "--out-dir",
                        type=str,
                        default='freddie_accuracy/',
                        help="Path to output plots. Default: freddie_accuracy/")
    parser.add_argument("-t",
                        "--thresholds",
                        nargs="+",
                        type=float,
                        default=default_thresholds,
                        help="Space separated list of alignment similarity thresholds. Default: {}".format(
                            ' '.join(map(str, default_thresholds)))
                        )
    args = parser.parse_args()
    return args


def read_nodes_and_edges(in_dir):
    tool_nodes = dict()
    tool_edges = dict()
    # Each <tool>.bed file lists that tool's isoform nodes; column 4 is the
    # node name, which is prefixed with the tool name (e.g. "truth_...").
    for bed in glob.iglob('{}/*.bed'.format(in_dir)):
        tool = bed.split('/')[-1][:-len('.bed')]
        tool_nodes[tool] = set()
        for l in open(bed):
            n = l.rstrip().split('\t')[3]
            t = n.split('_')[0]
            assert t == tool
            tool_nodes[tool].add(n)
    assert 'truth' in tool_nodes
    # Each <tool>.edges.tsv file lists weighted alignment-similarity edges
    # whose second endpoint belongs to that tool.
    for edge_tsv in glob.iglob('{}/*.edges.tsv'.format(in_dir)):
        tool = edge_tsv.split('/')[-1][:-len('.edges.tsv')]
        tool_edges[tool] = dict()
        for l in open(edge_tsv):
            n1, n2, w = l.rstrip().split('\t')
            t1 = n1.split('_')[0]
            t2 = n2.split('_')[0]

            assert t1 in [tool, 'truth']
            assert t2 == tool
            assert n1 in tool_nodes[tool] or n1 in tool_nodes['truth']
            assert n2 in tool_nodes[tool]
            k = (n1, n2)
            assert k not in tool_edges[tool], (k, tool, w, tool_edges[tool])
            tool_edges[tool][k] = float(w)
    # Copy the truth-vs-truth edges into every tool's edge set, then drop the
    # standalone truth entry.
    for tool in tool_edges:
        if tool == 'truth':
            continue
        for k, v in tool_edges['truth'].items():
            tool_edges[tool][k] = v
    tool_edges.pop('truth')
    return tool_nodes, tool_edges


def build_graph(X_nodes, Y_nodes, edges, threshold):
    # Nodes are tagged with the side they belong to (truth vs. tool); only
    # edges whose alignment similarity meets the threshold are kept.
    G = nx.Graph()
    G.add_nodes_from(X_nodes, bipartite=BIP_TRUTH)
    G.add_nodes_from(Y_nodes, bipartite=BIP_TOOL)
    G.add_edges_from([(u, v) for (u, v), w in edges.items() if w >= threshold])
    return G


def get_stats(G):
    comp_stats = dict(
        count=0,
        TP=0,
        FN=0,
        FP=0,
        LTP=0,
        LFX=0,
    )
    isof_stats = dict(
        truth_count=0,
        tool_count=0,
        TP_truth=0,
        TP_tool=0,
        FP=0,
        FN=0,
        LTP_truth=0,
        LTP_tool=0,
        LFP=0,
        LFN=0,
    )
    for component in nx.connected_components(G):
        Gc = G.subgraph(component)
        is_clique = (Gc.size() == len(component)*(len(component)-1)/2)
        N = sum(1 for _, bip in Gc.nodes(data='bipartite')
                if bip == BIP_TRUTH)  # number of truth isoforms
        M = sum(1 for _, bip in Gc.nodes(data='bipartite')
                if bip == BIP_TOOL)  # number of tool isoforms
        assert N + M == len(component)

        kind = None
        if is_clique and N > 0 and M > 0:
            kind = 'SM'  # clique with both tool and truth isoforms
        elif N == 0 or M == 0:
            kind = 'SX'  # only truth or only tool isoforms
        else:
            matched_req = {
                BIP_TRUTH: M,  # each truth isoform node must connect to all M tool nodes
                BIP_TOOL: N,   # each tool isoform node must connect to all N truth nodes
            }
            match_flag = all(  # check if all nodes have full oppositional matching
                sum(  # count the edges from v to nodes of the opposite type
                    1 for (_, u) in Gc.edges(v) if Gc.nodes[u]['bipartite'] != v_bip
                ) == matched_req[v_bip] for v, v_bip in Gc.nodes(data='bipartite')
            )
            if match_flag:
                kind = 'WM'  # full cross-matching, but not a clique
            else:
                kind = 'WX'  # some cross-edges are missing
        comp_stats['count'] += 1
        isof_stats['truth_count'] += N
        isof_stats['tool_count'] += M
        if kind == 'SM':
            comp_stats['TP'] += 1
            isof_stats['TP_truth'] += N
            isof_stats['TP_tool'] += M
        elif kind == 'WM':
            comp_stats['LTP'] += 1
            isof_stats['LTP_truth'] += N
            isof_stats['LTP_tool'] += M
        elif kind == 'WX':
            comp_stats['LFX'] += 1
            isof_stats['LFN'] += N
            isof_stats['LFP'] += M
        elif kind == 'SX':
            comp_stats['FN'] += int(N > 0)
            comp_stats['FP'] += int(M > 0)
            isof_stats['FN'] += N
            isof_stats['FP'] += M

    return comp_stats, isof_stats


def plot_components(stats, tools, tool_labels, threshold, out_dir):
    fig, axes = plt.subplots(2, 1, figsize=(25, 10), sharex=True)
    fig.suptitle(
        'Connected components at alignment threshold={}'.format(threshold))
    group_labels = [
        ('count', 'All CCs'),
        ('TP', 'True positive\n(cliques with GTIs and PIs)'),
        ('FN', 'False negative\n(CCs with only GTIs)'),
        ('FP', 'False positive\n(CCs with only PIs)'),
        ('LTP', 'Likely true positive\n(CCs with PIs matching all GTIs)'),
        ('LFX', 'Likely false positive/negative\n(CCs with PIs not matching all GTIs)'),
    ]
    x = np.arange(len(group_labels))  # the label locations
    width = 1/(2+len(tools))  # the width of the bars
    axes[0].set_xticks(x+width*(len(tools)-1)/2)
    axes[0].set_xticklabels([l for _, l in group_labels])
    for ax, kind in zip(axes, ['abso', 'norm']):
        if kind == 'abso':
            ax.set_ylabel('# of connected components (CCs)')
            ax.set_ylim(0, max(stats[t][threshold]['count']
                               for t in tools)*1.2)
        else:
            ax.set_ylim(0.0, 1.2)
            # data values are fractions, so scale the percent formatter to 1.0
            ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1.0))
            ax.set_ylabel(
                '% of each tool\'s graph connected components (CCs)')
        for idx, tool in enumerate(tools):
            if kind == 'abso':
                data = [stats[tool][threshold][k] for k, _ in group_labels]
            else:
                data = [stats[tool][threshold][k]/stats[tool]
                        [threshold]['count'] for k, _ in group_labels]
            rects = ax.bar(x + idx*width, data, width, label=tool_labels[tool])
            for rect in rects:
                height = rect.get_height()
                ax.annotate('{}'.format(height) if kind == 'abso' else '{:.1%}'.format(height),
                            rotation=60,
                            xy=(rect.get_x() + rect.get_width() / 2, height),
                            xytext=(0, 3),
                            textcoords="offset points",
                            ha='center', va='bottom')
    fig.legend(*axes[-1].get_legend_handles_labels(), loc='right')
    fig.savefig('{}/comp_{}.png'.format(out_dir, threshold))


def sort_tool_names(tools):
    sorted_tools = list()
    tool_to_name = dict()
    if 'isoform' in tools:
        sorted_tools.append('isoform')
        tool_to_name['isoform'] = 'Isoform alignment'
    if 'genome' in tools:
        sorted_tools.append('genome')
        tool_to_name['genome'] = 'Genome alignment'
    if 'segment' in tools:
        sorted_tools.append('segment')
        tool_to_name['segment'] = 'Freddie segmentation'
    if 'freddie' in tools:
        sorted_tools.append('freddie')
        tool_to_name['freddie'] = 'Freddie'
    if 'stringtie' in tools:
        sorted_tools.append('stringtie')
        tool_to_name['stringtie'] = 'StringTie2'
    for t in sorted(tools, reverse=True):
        if t.startswith('flair'):
            sorted_tools.append(t)
            tool_to_name[t] = 'FLAIR ({:.0%})'.format(float(t[6:]))
    for t in sorted(tools, reverse=True):
        if t not in sorted_tools:
            sorted_tools.append(t)
            tool_to_name[t] = t  # fall back to the raw tool name as the plot label
    return sorted_tools, tool_to_name


def main():
    args = parse_args()
    # Make sure the plot output directory exists before saving figures
    os.makedirs(args.out_dir, exist_ok=True)
    nodes, edges = read_nodes_and_edges(args.input_dir)
    tools, tool_labels = sort_tool_names(edges.keys())
    comp_stats = dict()
    isof_stats = dict()
    for tool in tools:
        comp_stats[tool] = dict()
        isof_stats[tool] = dict()
        for threshold in args.thresholds:
            G = build_graph(
                X_nodes=nodes['truth'],
                Y_nodes=nodes[tool],
                edges=edges[tool],
                threshold=threshold,
            )
            cur_comp_stats, cur_isof_stats = get_stats(G)
            comp_stats[tool][threshold] = cur_comp_stats
            isof_stats[tool][threshold] = cur_isof_stats
    plot_components(
        stats=comp_stats,
        tools=tools,
        tool_labels=tool_labels,
        threshold=0.97,
        out_dir=args.out_dir,
    )


if __name__ == "__main__":
    main()
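
For reviewers who want to sanity-check the component classification, here is a minimal toy sketch; it is not part of the committed file. It assumes py/freddie_accuracy.py is importable as freddie_accuracy (e.g. when run from the py/ directory), and the node names, similarity weights, and threshold below are made up purely for illustration.

# Toy example (not part of the committed file): one clean one-to-one match,
# one pair whose similarity falls below the threshold, and one truth isoform
# with no match at all.
from freddie_accuracy import build_graph, get_stats

truth_nodes = {'truth_1', 'truth_2', 'truth_3'}  # hypothetical ground-truth isoforms
tool_nodes = {'toolX_1', 'toolX_2'}              # hypothetical predicted isoforms
edges = {
    ('truth_1', 'toolX_1'): 0.99,  # kept at threshold 0.97 -> clique -> counted as TP
    ('truth_2', 'toolX_2'): 0.90,  # dropped at threshold 0.97 -> singleton FN and FP components
}

G = build_graph(X_nodes=truth_nodes, Y_nodes=tool_nodes, edges=edges, threshold=0.97)
comp_stats, isof_stats = get_stats(G)
print(comp_stats)  # {'count': 4, 'TP': 1, 'FN': 2, 'FP': 1, 'LTP': 0, 'LFX': 0}
print(isof_stats)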