-
Notifications
You must be signed in to change notification settings - Fork 3
/
PaperPlots.R
executable file
·640 lines (549 loc) · 24 KB
/
PaperPlots.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
########
# Name:
# PaperPlots.R
#
# Description:
# Generates plots for the paper from the result data. This script functions
# completely independent and does not require the "BenchmarkResultsGenerics.R"
# script.
#
# Important:
# Be sure to adjust the items in the config section before running the script!
########
##################
### Config ###
##################
# Directory storing script input:
# - benchmark_data-ncbi_id.tsv
# - amelie.tsv
# - gado.tsv
# - hiphive.tsv
# - phenix.tsv
# - phenomizer.tsv
# - phenotips.tsv
# - pubcasefinder.tsv
# - vibe.tsv
# - lirical.tsv
baseDir <- "~/Desktop/zenodo_download/"
# Directory to write the output images to (be sure that directoy exists).
imgExportDir <- "~/Desktop/zenodo_download/out/"
##################
### Libraries ###
##################
library(plyr)
library(dplyr)
library(reshape2)
library(ggplot2)
library(ggdendro)
library(cowplot)
library(grid)
library(rcartocolor)
##################
### Functions ###
##################
########
# Name:
# initializeGraphicsDevice
#
# Description:
# A simple wrapper for writing a file to a predetermined graphics device
# that prepends the directory and appends the file extension.
#
# Input:
# fileName - Filename (excluding file extension) to be used for storage.
# width - The width of the device in inches. -> see ?pdf()
# height - The height of the device in inches. -> see ?pdf()
#
# Output:
#
########
initializeGraphicsDevice <- function(fileName, width, height) {
pdf(paste0(imgExportDir, fileName, ".pdf"), width=width, height=height)
}
########
# Name:
# ggSaveCustom
#
# Description:
# A simple wrapper for writing a file using ggsave that prepends the directory
# and appends the file extension.
#
# Input:
# fileName - Filename (excluding file extension) to be used for storage.
# width - The width of the device in inches. -> see ?ggsave()
# height - The height of the device in inches. -> see ?ggsave()
#
# Output:
#
########
ggSaveCustom <- function(fileName, width, height) {
ggsave(paste0(imgExportDir, fileName, ".pdf"), width=width, height=height)
}
ggSaveCustomWithPlot <- function(fileName, width, height, plot) {
ggsave(paste0(imgExportDir, fileName, ".pdf"), plot=plot, width=width, height=height)
}
########
# Name:
# readResultFile
#
# Description:
# Reads in a single file containing benchmarking results.
#
# Input:
# filePath - The path to the file to be loaded.
#
# Output:
# A table from the result data. Though as the input is expected to only contain
# 2 columns and one of these is used as row names, type becomes a list.
########
readResultFile <- function(filePath) {
read.table(paste0(baseDir, filePath),
header=T, sep="\t", colClasses=c("character"),
row.names=1)
}
########
# Name:
# resultsPositionCalculator
#
# Description:
# For each row in benchmarkData (containing an LOVD and a gene), looks at the
# benchmarkResults to determine the gene positions.
#
# Input:
# benchmarkData - The data on which the benchmark is based upon. Should at least
# contain a column "gene" containing the gene to be found
# (determening the gene position) and "lovd" to determine the ID
# of which phenotype-set was used.
# benchmarkResults - All the output from a benchmark. Each row contains the LOVD
# as row name and the ordered genes as a column named "genes"
# (these genes are comma-separated within this single field).
#
# Output:
#
########
resultsPositionCalculator <- function(benchmarkData, benchmarkResults) {
apply(benchmarkData, 1, singleResultPositionCalculator,
benchmarkResults=benchmarkResults)
}
########
# Name:
# singleResultPositionCalculator
#
# Description:
# Processes a single row of the benchmarkData as defined in the function
# resultsPositionCalculator. For this single benchmark slice, it looks up the
# corresponding LOVD in benchmarkResults and then defines the gene position.
#
# Input:
# benchmarkDataRow - A list (row from benchmarkData given by
# resultsPositionCalculator) containing an LOVD and a gene.
# benchmarkResults - All the output from a benchmark. Each row contains the LOVD
# as row name and the ordered genes as a column named "genes"
# (these genes are comma-separated within this single field).
#
# Output:
#
########
singleResultPositionCalculator <- function(benchmarkDataRow, benchmarkResults) {
match(benchmarkDataRow["gene"],
strsplit(benchmarkResults[benchmarkDataRow["lovd"],"suggested_genes"], ",")[[1]])
}
########
# Name:
# calculateTotalGenesFound
#
# Description:
# Calculates the total number of genes per item for the input list benchmarkResults.
#
# Input:
# benchmarkResults - All the output from a benchmark. Each row contains the LOVD
# as row name and the ordered genes as a column named "genes"
# (these genes are comma-separated within this single field).
#
# Output:
# An integer vector containing the total number of genes for each (named) item
# (in the row order of benchmarkResults).
########
calculateTotalGenesFound <- function(benchmarkResults) {
unlist(lapply(sapply(benchmarkResults[,"suggested_genes"], strsplit, split=","), length), use.names=FALSE)
}
########
# Name:
# sortRows
#
# Description:
# Orders the rows based on their row name.
#
# Input:
# benchmarkResults - All the output from a benchmark. Each row contains the LOVD
# as row name and the ordered genes as a column named "genes"
# (these genes are comma-separated within this single field).
#
# Output:
# benchmarkResults with the rows ordered on their name.
########
sortRows <- function(benchmarkResults) {
benchmarkResults[order(as.numeric(rownames(benchmarkResults))), , drop=FALSE]
}
########
# Name:
# ggplotLegend
#
# Description:
# Retrieves legend from a ggplot.
# Based on https://stackoverflow.com/a/12041779
#
# Input:
# ggplot - The plot to retrieve the legend from.
#
# Output:
# The legend from the plot.
########
ggplotLegend <- function(ggplot){
tmp <- ggplot_gtable(ggplot_build(ggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
##################
### Code ###
##################
###
### Basic data preperations for further usage.
###
# Load benchmark input data (with ncbi gene id's).
benchmarkData <- read.table(paste0(baseDir,"benchmark_data-ncbi_id.tsv"), header=T,
sep="\t",colClasses=c(rep("character", 3),
"factor", "character"))
# load NCBI symbols of CGD genes and converts to vector.
cgd <- as.character(scan(paste0(baseDir, "cgd-ids_2020-02-04.csv"), sep=","))
# Loads benchmark results and sorts it so that row order is identical.
amelie <- sortRows(readResultFile("amelie.tsv"))
gado <- sortRows(readResultFile("gado.tsv"))
phenomizer <- sortRows(readResultFile("phenomizer.tsv"))
phenotips <- sortRows(readResultFile("phenotips.tsv"))
hiphive <- sortRows(readResultFile("hiphive.tsv"))
phenix <- sortRows(readResultFile("phenix.tsv"))
pubcf <- sortRows(readResultFile("pubcasefinder.tsv"))
vibe <- sortRows(readResultFile("vibe.tsv"))
lirical <- sortRows(readResultFile("lirical.tsv"))
# Calculate absolute positions. This is done by processing benchmarkData row-by-row
# and therefore the LOVD row order of positionResults is equal to that of benchmarkData.
positionResults <- data.frame("AMELIE"=resultsPositionCalculator(benchmarkData, amelie),
"GADO"=resultsPositionCalculator(benchmarkData, gado),
"Phenomizer"=resultsPositionCalculator(benchmarkData, phenomizer),
"Phenotips"=resultsPositionCalculator(benchmarkData, phenotips),
"VIBE"=resultsPositionCalculator(benchmarkData, vibe),
"hiPHIVE"=resultsPositionCalculator(benchmarkData, hiphive),
"PubCaseF."=resultsPositionCalculator(benchmarkData, pubcf),
"PhenIX"=resultsPositionCalculator(benchmarkData, phenix),
"LIRICAL"=resultsPositionCalculator(benchmarkData, lirical))
# Calculate total number of genes.
totalResults <- data.frame("AMELIE"=calculateTotalGenesFound(amelie),
"GADO"=calculateTotalGenesFound(gado),
"Phenomizer"=calculateTotalGenesFound(phenomizer),
"Phenotips"=calculateTotalGenesFound(phenotips),
"VIBE"=calculateTotalGenesFound(vibe),
"hiPHIVE"=calculateTotalGenesFound(hiphive),
"PubCaseF."=calculateTotalGenesFound(pubcf),
"PhenIX"=calculateTotalGenesFound(phenix),
"LIRICAL"=calculateTotalGenesFound(lirical),
row.names=rownames(amelie))
# Replicates some of the totalResults so that size is equal to positionResults.
totalResults <- totalResults[benchmarkData$lovd,]
# Make sure to use same order as colnames(positionResults)!!!
tools <- c("amelie", "gado", "phenomizer", "phenotips", "vibe", "hiphive", "pubcf", "phenix", "lirical") # Names of variables used as input.
toolNames <- c("AMELIE", "GADO", "Phenomizer", "Phenotips", "VIBE", "hiPHIVE", "PubCaseF.", "PhenIX", "LIRICAL") # Names of the tools within the dataframe.
# Generate splitted genes for all tools: [[tool]][[lovd]]@genes[[1]]
setClass("suggestedGenes", representation(genes="vector"))
toolOutputSplitted <- sapply(tools, function(tool) {
toolData <- get(tool)
sapply(rownames(toolData), function(lovd, toolData) {
new("suggestedGenes", genes=strsplit(toolData[lovd,"suggested_genes"], ","))
}, toolData=toolData)
}, simplify=FALSE)
names(toolOutputSplitted) <- colnames(positionResults)
###
### Data info.
###
# Check whether the rows of all benchmarks are identically ordered.
rownames(amelie) == rownames(gado) &&
rownames(gado) == rownames(phenomizer) &&
rownames(phenomizer) == rownames(phenotips) &&
rownames(phenotips) == rownames(vibe) &&
rownames(vibe) == rownames(hiphive) &&
rownames(hiphive) == rownames(phenix) &&
rownames(phenix) == rownames(pubcf) &&
rownames(pubcf) == rownames(lirical)
# Check whether there is any phenotype set for which no single gene was found.
any(is.na(totalResults))
# Tool colors
toolColors <- carto_pal(9, "Safe")
names(toolColors) <- toolNames
##############################
########## FIGURE 1 ##########
##############################
### Scatterplot with means and missing
# Preperations.
posRelM <- melt(positionResults, id.vars = 0)
totResM <- melt(totalResults, id.vars = 0)
posRelM$total <- totResM$value
colnames(posRelM) <- c("tool", "rank", "total")
posRelM$relative <- posRelM$rank / posRelM$total
levels(posRelM$tool)[match("PubCF.",levels(posRelM$tool))] <- "PubCaseF."
gd <- posRelM %>%
group_by(tool) %>%
summarise(total = mean(total, na.rm=T),
rank = mean(rank, na.rm=T))
toolNaRanks <- aggregate(rank ~ tool, data=posRelM, function(x) {sum(is.na(x))}, na.action = NULL)
gd$NAs <- paste(toolNaRanks$tool, " (", toolNaRanks$rank, " missed)", sep="")
gd$labX <- c(160, 2400, 5, 5, 70, 120, 5, 5, 140)
gd$labY <- c(30000, 30000, 30000, 3000, 1000, 3000, 10000, 1000, 10000)
# Plotting figure.
ggplot() +
geom_point(data = posRelM, aes(x=total, y=rank, color=tool), size = 0.3) + #0.1
geom_point(data = gd, aes(x=total, y=rank, fill=tool), shape = 21, color = "black", stroke = 1, size = 2) +
# legend stuff
geom_point(aes(x=11, y=150), color = "black", size=0.3) +
geom_text(aes(x=11, y=150, label = "= one causal gene"), color = "black", size = 2, hjust = 0, nudge_x = 0.1) +
geom_point(aes(x=11, y=300), shape = 1, color = "black", stroke = 1, size = 2) +
geom_text(aes(x=11, y=300, label="= tool X and Y means"), color="black", hjust = 0, size = 2, nudge_x = 0.1) +
geom_text(aes(x=8.5, y=30, label="Total: 308\ncausal genes"), color="black", hjust = 0, size = 2, nudge_x = 0.1) +
geom_label(data = gd, aes(x = labX, y = labY, label = NAs, fill = tool), color="white", hjust = 0, size = 2, fontface = "bold") +
scale_y_log10(breaks = c(1, 10, 100, 1000, 10000)) + scale_x_log10(breaks = c(1, 10, 100, 1000, 10000, 40000)) +
theme_bw() +
theme(panel.grid = element_blank(), panel.border = element_rect(colour = "black"), axis.ticks = element_line(colour = "black"), legend.position = "none", axis.text = element_text(color = "black")) +
scale_color_manual(values = toolColors) +
scale_fill_manual(values = toolColors) +
labs(x = "Total number of candidate genes returned", y = "Rank of the causal gene")
ggSaveCustom("Figure1", width=4, height=2.5)
# Removes variables specific to this section.
rm(posRelM,totResM,gd,toolNaRanks)
##############################
########## FIGURE 2 ##########
##############################
### Heatmap showing absolute positions from the different tools.
# Preperations.
customBreaks <- c(1, 10, 100, 1000, 10000, 40000)
customColours <- c("#0072B2","#009E73","#F0E442", "#D55E00")
naValue <- 50000
dataToPlot <- positionResults
dataToPlot[is.na(dataToPlot)] <- naValue
xClust <- hclust(dist(t(dataToPlot), method = "euclidean"), method="complete")
yClust <- hclust(dist(dataToPlot, method = "euclidean"), method="complete")
dataToPlot <- cbind(id=rownames(dataToPlot), dataToPlot)
dataToPlot <- melt(dataToPlot, id.vars="id",
variable.name="tool",
value.name="absPosition")
# Renames "PubCaseF." to "PubCF."
dataToPlot$tool <- revalue(dataToPlot$tool, c("PubCaseF."="PubCF."))
# Orders data based on clustering.
dataToPlot$id <- factor(dataToPlot$id,
levels = dataToPlot$id[yClust$order],
ordered = TRUE)
dataToPlot$tool <- factor(dataToPlot$tool,
levels = levels(dataToPlot$tool)[xClust$order],
ordered = TRUE)
# Theme for dendrograms.
dendroTheme <- theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
plot.margin = unit(c(0,0,0,0), "cm"))
# Tool dendogram. Remove "+ dendroTheme" for comparing values with hmPlot.
xClustPlot <- ggdendrogram(xClust) + dendroTheme
# Patient cases dendogram. Remove "+ dendroTheme" for comparing values with hmPlot.
yClustPlot <- ggdendrogram(yClust, rotate=T) + dendroTheme
# Heatmap.
hmPlot <- ggplot(dataToPlot, aes(x=tool, y=id, colour="")) + # colour is to trick ggplot for "not found" in legend
theme_bw() +
theme(legend.box="horizontal",
legend.box.spacing=unit(0, "cm"),
legend.key.size=unit(0.4, "cm"),
legend.text=element_text(size=6),
legend.title=element_text(size=9),
panel.grid=element_blank(),
panel.border=element_blank(),
axis.title = element_text(size = 20),
axis.text.x=element_text(colour="black", margin=margin(b=5)),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(), # Can be commented to compare with yClustPlot
axis.ticks.y=element_blank()) +
labs(x="Gene prioritization tool", y="Causal genes in patient cases",
fill="Causal gene rank\n1=best, 40k=worst",
colour="Missed gene\n(not found)") +
geom_tile(aes(fill=absPosition)) +
scale_fill_gradientn(colours = customColours, trans = "log", limit=c(1,40000), breaks = customBreaks, na.value = "black") +
scale_colour_manual(values=NA, labels = "") + # makes sure aes colour is not visible
scale_y_discrete(expand=c(0,0)) # moves x labels closer
# Filter legend from heatmap and generate plot from it.
hmInfo <- ggplotLegend(hmPlot)
hmInfoPlot <- ggdraw() +
draw_grob(hmInfo$grobs[[2]], 0.25, 0) + # "not found" 0.1,0
draw_grob(hmInfo$grobs[[1]], -0.70, 0) + # color gradient legend
draw_label("X/Y dendrograms by\nhierarchical clustering\non Euclidean distance", x = 0.51, y = 0.17, size = 8)
# Remove legend from original heatmap.
hmPlot <- hmPlot + theme(legend.position = "none")
# Generate image.
initializeGraphicsDevice("Figure2", width=7, height=6)
plot_grid(
xClustPlot, hmInfoPlot,
hmPlot, yClustPlot,
nrow = 2, ncol = 2,
align="hv",
axis="tblr",
rel_widths = c(3,1),
rel_heights = c(1,2.5),
scale=c(0.9,0.95,1,1.08)
)
dev.off()
# Removes variables specific to this section.
rm(customBreaks,customColours,naValue,dataToPlot,xClust,yClust,dendroTheme,xClustPlot,yClustPlot,hmPlot,hmInfo,hmInfoPlot)
##############################
########## FIGURE 3 ##########
##############################
### Analysis to show practical value.
# Defines number of runs.
runs <- 25
# Defines number of spiking genes.
spikingGenes <- 19
# Set seed for pseudo-random numbers for reproducibility.
# Don't forget to reset seed when re-running code!!!
set.seed(0)
# Runs benchmark a number of times.
enrichedScores <- sapply(1:runs, function(x, spikingGenes) {
# Generate sample matrix containing the genes to use for ranking (together with target gene).
cgdSample <- t(apply(benchmarkData, 1, function(case, spikingGenes) {
# Select all CGD genes minus the current causal one.
cgdMinusCurrentGene <- cgd[!cgd %in% case["gene"]]
# Randomly sample 19 other genes.
cgdSample <- sample(cgdMinusCurrentGene, spikingGenes)
}, spikingGenes=spikingGenes))
# Goes through all cases.
return(t(sapply(1:nrow(benchmarkData), function(nRow, benchmarkData, cgdSample, toolOutputSplitted) {
# Defines the target gene.
benchmarkGene <- benchmarkData[nRow,"gene"]
# Defines the full set of genes which need to be ranked (target + cgdSample).
geneSet <- c(benchmarkGene, cgdSample[nRow,])
# Goes through all tools.
sapply(names(toolOutputSplitted), function(toolName, lovd, benchmarkGene, geneSet, toolOutputSplitted) {
# Finds matches for the enriched gene set within the full output for that tool/lovd.
geneSetMatches <- match(geneSet, toolOutputSplitted[[toolName]][[lovd]]@genes[[1]])
# If benchmark gene is not found, returns adjusted score.
if(is.na(geneSetMatches[1])){
#return(mean(c(sum(!is.na(geneSetMatches)), length(geneSet)))) # method 1: missing rank middle of input set minus outsize, realistic
#return(length(geneSet)) # method 2: input set size, pessimistic, plot will spike at very end
#return(NA) # method 3: harsh: missing genes do NOT increase the cumulative hits, some lines go flat
return(sample(sum(!is.na(geneSetMatches)):length(geneSet), 1)) # method 4: missing rank at random position of input set minus outsize, super realistic
}
# Orders the found matches (NA=LAST).
geneSetOrdered <- geneSet[order(geneSetMatches)]
# Returns the found location of the target gene in the enriched gene set.
return(match(benchmarkGene, geneSetOrdered))
}, lovd=benchmarkData[nRow,"lovd"], benchmarkGene=benchmarkGene, geneSet=geneSet, toolOutputSplitted=toolOutputSplitted)
}, benchmarkData=benchmarkData, cgdSample=cgdSample, toolOutputSplitted=toolOutputSplitted)))
}, spikingGenes=spikingGenes, simplify=FALSE)
# Calculate median per tool/case combination.
MedianScores <- matrix(sapply(1:length(enrichedScores[[1]]), function(x) {
median(sapply(enrichedScores, "[[", x))
}), ncol=ncol(positionResults), dimnames=list(1:nrow(benchmarkData), names(toolOutputSplitted)))
# Genes found per cutoff.
foundPerCutoff <- sapply(1:(spikingGenes+1), function(x) {
apply(MedianScores <= x,2,sum, na.rm=TRUE)
})
colnames(foundPerCutoff) <- 1:(spikingGenes+1)
# Plot preperations.
melted <- melt(t(foundPerCutoff))
colnames(melted) <- c("cutoff", "tool", "value")
# Plot figure.
ggplot() +
geom_line(data = melted, aes(x = cutoff, y = value, color = tool), size=1) +
geom_point(data = melted, aes(x = cutoff, y = value, color = tool), size=3) +
scale_color_manual(values = toolColors) +
scale_x_continuous(breaks = seq(1,20,1), limits = c(1,20)) +
scale_y_continuous(breaks = seq(60,300,20)) +
theme(text = element_text(size=20), legend.title=element_blank(), legend.position = c(0.7, 0.45),
panel.background = element_blank(), legend.key = element_blank(), legend.background = element_blank(),
legend.text = element_text(size=12), legend.key.width = unit(2, "cm"), legend.key.height = unit(1, "cm")) +
labs(x = "Gene rank in simulated spiked-in clinical gene sets", y = "Cumul. nr. of causal genes detected")
grid.ls(grid.force())
grid.gedit("key-[0-9]*-1-2", size = unit(8, "mm"))
myPlot <- (grid.grab())
ggSaveCustomWithPlot("Figure3", width=8, height=5, plot=myPlot)
# Removes variables specific to this section.
rm(runs,spikingGenes,enrichedScores,MedianScores,foundPerCutoff,melted)
##############################
########## FIGURE 4 ##########
##############################
uniqPerThr <- data.frame()
for(i in 1:1000)
{
cutoffAbs <- i
withinTopX <- apply(positionResults <= cutoffAbs, 2, which)
uniqueFound <- sapply(1:length(withinTopX), function(x) {
withinTopX[[x]][which(!unlist(withinTopX[x]) %in% unlist(withinTopX[-x]))]
})
names(uniqueFound) <- names(withinTopX)
sapply(uniqueFound, length)
for(name in names(uniqueFound))
{
uniqPerThr <- rbind(uniqPerThr, data.frame(tool = name, topX = i, uniq = length(uniqueFound[[name]])))
}
}
levels(uniqPerThr$tool)[match("PubCF.",levels(uniqPerThr$tool))] <- "PubCaseF."
set.seed(0) # for reproducible jitter
ggplot() +
geom_jitter(data=uniqPerThr, aes(x = topX, y = uniq, color = tool), size=0.1, width = 0, height=0.35) +
geom_segment(aes(x = 0, xend=1000, y=c(-0.5:33), yend=c(-0.5:33)), color="gray50", size=0.1, linetype="solid") +
scale_color_manual(values = toolColors) +
theme(text = element_text(size=16), legend.title=element_blank(), legend.position = c(0.755, 0.6),
legend.background = element_rect(fill="white"), legend.key.height = unit(0.7, "cm")) +
scale_y_continuous(breaks = seq(0,32,2)) +
scale_x_continuous(breaks = seq(0,1000,100), limit=c(0,1000)) +
theme(legend.text=element_text(size=9)) +
guides(colour = guide_legend(override.aes = list(size=6))) +
labs(y = "Causal genes found by only 1 tool", x = "Number of top hits considered in tool output")
ggSaveCustom("Figure4", width=6, height=4)
###
### Gene overrepresentation in VIBE output
###
cutoff <- 10
uniqueGenesInTopOutput <- unique(as.vector(sapply(toolOutputSplitted$VIBE, function(x){x@genes[[1]][1:cutoff]})))
foundPositionsTopGenes <- sapply(uniqueGenesInTopOutput, function(gene, vibeToolOutput) {
sapply(vibeToolOutput, function(toolOutput, gene){
match(gene, toolOutput@genes[[1]])
}, gene=gene)
}, vibeToolOutput=toolOutputSplitted$VIBE)
timesGeneFoundinTop <- apply(foundPositionsTopGenes, 2, function(x) {length(which(x <=10))})
# numbers in Discussion section of manuscript (first 2)
sort(timesGeneFoundinTop, decreasing=T)
# Removes variables specific to this section.
rm(cutoff,uniqueGenesInTopOutput,foundPositionsTopGenes,timesGeneFoundinTop)
###
### Unique found per tool at defined cutoff.
###
# Defines cutoff for showing unique.
cutoffLimit <- 40
# Gives unique for each cutoff per tool.
uniquePerCutoff <- sapply(1:cutoffLimit, function(cutoffLimit, positionResults) {
withinTopX <- apply(positionResults <= as.double(cutoffLimit), 2, which)
return(sapply(1:length(withinTopX), function(x) {
length(withinTopX[[x]][which(!unlist(withinTopX[x]) %in% unlist(withinTopX[-x]))])
}))
}, positionResults=positionResults)
dimnames(uniquePerCutoff) <- list(colnames(positionResults), 1:cutoffLimit)
# Total unique per cutoff.
apply(uniquePerCutoff, 2, sum)
# numbers in Result section of manuscript
uniquePerCutoff[,20]
# Plot figure.
initializeGraphicsDevice("unique_per_cutoff", width=8, height=4)
par(mar=c(5.1, 4.1, 4.1, 5.1))
barplot(uniquePerCutoff, col=toolColors[rownames(uniquePerCutoff)], space=FALSE, border=NA, las=1,
xlab="cutoff", ylab="percentage unique hits")
par(xpd=TRUE) # no clipping for drawing outside plot
legend(41,70,
rownames(uniquePerCutoff), fill=toolColors[rownames(uniquePerCutoff)], ncol=1, cex=0.62)
dev.off()
# Removes variables specific to this section.
rm(cutoffLimit,uniquePerCutoff)