Skip to content

Commit

Permalink
working reports
Browse files Browse the repository at this point in the history
  • Loading branch information
AggressiveHayBale committed Jan 31, 2023
1 parent 2a93a2d commit 0b6ce07
Show file tree
Hide file tree
Showing 12 changed files with 257 additions and 90 deletions.
186 changes: 155 additions & 31 deletions bin/R_comparison.R
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,34 +1,83 @@
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)

library(gggenomes)
library(ampir)
###########
#Functions#
###########

##Ampir-read faa
read_faa<-function (file = NULL)
{
faa_lines <- readLines(file)
seq_name_index <- grep(">", faa_lines)
seq_name <- gsub(">", "", faa_lines[seq_name_index])
seq_aa_start_index <- seq_name_index + 1
seq_aa_end_index <- c(seq_name_index, length(faa_lines) +
1)[-1] - 1
seq_aa <- rep(NA, length(seq_name_index))
for (i in seq_along(seq_name_index)) {
seq_aa_start <- seq_aa_start_index[i]
seq_aa_end <- seq_aa_end_index[i]
seq_aa[i] <- gsub("[[:space:]]", "", paste(faa_lines[seq_aa_start:seq_aa_end],
collapse = ""))
}
data.frame(seq_name, seq_aa, stringsAsFactors = FALSE)
}

###########
#Data load#
###########

library(stringr)
library(readr)
library(dplyr)
library(tidyr)


id <- args[1]

prokka_gbk<- args[1]
prokka_faa<- args[2]
prokka_gbk<- args[3]

bakta_gbff<- args[3]
bakta_faa<- args[4]
bakta_gbff<- args[5]

pgap_gff<- args[5]
pgap_faa<- args[6]

eggnog_faa<- args[6]
eggnog_gff<- args[7]
eggnog_faa<- args[8]

pgap_faa<- args[8]
pgap_gff<- args[9]

fasta_type <- args[10]



########
#Prokka#
########

# File read
annotation <- read_gbk(prokka_gbk)
annotation <- suppressMessages(read_delim(prokka_gbk,
delim = "\t", escape_double = FALSE,
col_names = FALSE, comment = c("#"), trim_ws = TRUE))
fasta_data <- read_faa(prokka_faa)
# Merging

#Cleanup
annotation <- invisible(na.omit(annotation))
annotation<- annotation[!(annotation$X3=="region" | annotation$X3=="gene"),]
annotation <- separate(annotation, X9, c(NA, "ID"), remove = FALSE, "ID=")
annotation$ID <- gsub(";.*","",annotation$ID)
annotation$product <- str_extract(annotation$X9,pattern = "product=(.*?;)")
annotation$product <- gsub("product=", "", annotation$product)
annotation$product <- gsub(";", "", annotation$product)
annotation$product[annotation$product==""] <- NA

fasta_data$seq_name <- sub(" .*", "", fasta_data$seq_name )
annotation_merged <- merge(annotation,fasta_data, by.x= "locus_tag", by.y = "seq_name", all=TRUE)
# Merging
annotation_merged <- merge(annotation,fasta_data, by.x= "ID", by.y = "seq_name", all=TRUE)
###tRNA and rRNA does not have sequences - only CDS

# Cleanup
annotation_merged<- annotation_merged[!(annotation_merged$type=="region" | annotation_merged$type=="gene"),]
colnames(annotation_merged)[colnames(annotation_merged) == 'seq_aa']<- c("sequence")

prokka<- annotation_merged
Expand All @@ -38,13 +87,26 @@ prokka<- annotation_merged
########

# File read
annotation <- read_gbk(bakta_gbff)
annotation <- suppressMessages(read_delim(bakta_gbff,
delim = "\t", escape_double = FALSE,
col_names = FALSE, comment = c("#"), trim_ws = TRUE))
fasta_data <- read_faa(prokka_faa)


#Cleanup
annotation <- invisible(na.omit(annotation))
annotation<- annotation[!(annotation$X3=="region" | annotation$X3=="gene"),]
annotation <- separate(annotation, X9, c(NA, "ID"), remove = FALSE, "ID=")
annotation$ID <- gsub(";.*","",annotation$ID)
annotation$product <- str_extract(annotation$X9,pattern = "Name=(.*?;)")
annotation$product <- gsub("Name=", "", annotation$product)
annotation$product <- gsub(";", "", annotation$product)
annotation$product[annotation$product==""] <- NA

# Merging
fasta_data$seq_name <- sub(" .*", "", fasta_data$seq_name )
annotation_merged <- merge(annotation,fasta_data, by.x= "locus_tag", by.y = "seq_name", all=TRUE)
annotation_merged <- merge(annotation,fasta_data, by.x= "ID", by.y = "seq_name", all=TRUE)
# Cleanup
annotation_merged<- annotation_merged[!(annotation_merged$type=="region" | annotation_merged$type=="gene"),]
colnames(annotation_merged)[colnames(annotation_merged) == 'seq_aa']<- c("sequence")

bakta <- annotation_merged
Expand All @@ -54,14 +116,26 @@ bakta <- annotation_merged
#PGAP#
######

annotation <- read_gbk(pgap_gff)
annotation <- suppressMessages(read_delim(pgap_gff,
delim = "\t", escape_double = FALSE,
col_names = FALSE, comment = c("#"), trim_ws = TRUE))
fasta_data <- read_faa(pgap_faa)

annotation<- annotation[!(annotation$X3=="region" | annotation$X3=="gene"),]

annotation <- separate(annotation, X9, c(NA, "ID"), remove = FALSE, "ID=cds-")
annotation$ID <- gsub(";.*","",annotation$ID)

annotation$product <- str_extract(annotation$X9,pattern = "product=(.*?;)")
annotation$product <- gsub("product=", "", annotation$product)
annotation$product <- gsub(";", "", annotation$product)

# Merging
fasta_data$seq_name <- sub(" .*", "", fasta_data$seq_name )
fasta_data$seq_name <- sapply(str_split(fasta_data$seq_name,"\\|"), function(x) (x[3]))
annotation_merged <- merge(annotation,fasta_data, by.x= "locus_tag", by.y = "seq_name", all=TRUE)
annotation_merged <- merge(annotation,fasta_data, by.x= "ID", by.y = "seq_name", all=TRUE)
# Cleanup
annotation_merged<- annotation_merged[!(annotation_merged$type=="region" | annotation_merged$type=="gene"),]

colnames(annotation_merged)[colnames(annotation_merged) == 'seq_aa']<- c("sequence")

pgap<-annotation_merged
Expand All @@ -77,12 +151,18 @@ annotation <- suppressMessages(read_delim(eggnog_gff,
fasta_data <- read_faa(eggnog_faa)
# Merging
fasta_data$seq_name <- sapply(str_split(fasta_data$seq_name," "), function(x) (x[1]))
annotation_merged <- merge(annotation,fasta_data, by.x= "X1", by.y = "seq_name", all=TRUE)
annotation_merged <- merge(annotation,fasta_data, by=0, all=TRUE)
# Cleanup
colnames(annotation_merged)[colnames(annotation_merged) == 'seq_aa']<- c("sequence")

#Remove star from end of sequence
annotation_merged$sequence = substring(annotation_merged$sequence,1, nchar(annotation_merged$sequence)-1)

em_Preferred_name <- str_extract(annotation_merged$X9,pattern = "em_Preferred_name=(.*?;)")
em_Preferred_name <- gsub("em_Preferred_name=", "", em_Preferred_name)
em_Preferred_name <- gsub(";", "", em_Preferred_name)
em_Preferred_name[em_Preferred_name==""] <- NA
annotation_merged$product<- em_Preferred_name
eggnog<-annotation_merged

##########
Expand All @@ -96,10 +176,19 @@ pgap_gene_ct <- nrow(pgap)
eggnog_gene_ct <- nrow(eggnog)

#Hypothetical proteins
nrow(prokka[prokka$product=="hypothetical protein",])
nrow(bakta[bakta$product=="hypothetical protein",])
nrow(pgap[pgap$product=="hypothetical protein",])
sum(is.na(eggnog$X8))+nrow(eggnog[eggnog$X8=="-" & eggnog$X9=="-",]) + sum(str_detect(na.omit(eggnog$X8), "unknown"))
prokka_hyps <- nrow(prokka[prokka$product=="hypothetical protein",])
bakta_hyps <- nrow(bakta[bakta$product=="hypothetical protein",])
pgap_hyps <- nrow(pgap[pgap$product=="hypothetical protein",])
eggnog_hyps <- nrow(pgap[pgap$product==NA,])
#eggnog_hyps<- sum(is.na(eggnog$X8))+nrow(eggnog[eggnog$X8=="-" & eggnog$X9=="-",]) + sum(str_detect(na.omit(eggnog$X8), "unknown"))

#Percent of hyps
prokka_perc_hyps <- (prokka_hyps/prokka_gene_ct)*100
bakta_perc_hyps <- (bakta_hyps/bakta_gene_ct)*100
pgap_perc_hyps <- (pgap_hyps/pgap_gene_ct)*100
eggnog_perc_hyps<- (eggnog_hyps/eggnog_gene_ct)*100



##Check what could be hyps
#library(wordcloud)
Expand All @@ -112,9 +201,31 @@ sum(is.na(eggnog$X8))+nrow(eggnog[eggnog$X8=="-" & eggnog$X9=="-",]) + sum(str_d
#wordcloud(words = dat$word, freq = dat$freq,colors=brewer.pal(8, "Dark2"))

#Type of sequence
prokka_type <-table(prokka$type)
bakta_type <- table(bakta$type)
pgap_type <- table(pgap$type)
prokka_type <-as.list(table(prokka$X3))
bakta_type <- as.list(table(bakta$X3))
pgap_type <- as.list(table(pgap$X3))

prokka_type <-as.list(table(prokka$X3))
bakta_type <- as.list(table(bakta$X3))
pgap_type <- as.list(table(pgap$X3))

prokka_CDS<-ifelse(is.null(prokka_type$CDS),0,prokka_type$CDS )
bakta_CDS<- ifelse(is.null(bakta_type$CDS),0,bakta_type$CDS )
pgap_CDS<- ifelse(is.null(pgap_type$CDS),0,pgap_type$CDS )

prokka_rrna<- ifelse(is.null(prokka_type$rRNA),0,prokka_type$rRNA )
bakta_rrna<-ifelse(is.null(bakta_type$rRNA),0,bakta_type$rRNA )
pgap_rrna<- ifelse(is.null(pgap_type$rRNA),0,pgap_type$rRNA )


prokka_trna<-ifelse(is.null(prokka_type$tRNA),0,pgap_type$tRNA )
bakta_trna<-ifelse(is.null(bakta_type$tRNA),0,bakta_type$tRNA )
pgap_trna<-ifelse(is.null(pgap_type$tRNA),0,pgap_type$tRNA )


prokka_tmrna<-ifelse(is.null(prokka_type$tmRNA),0,pgap_type$tmRNA )
bakta_tmrna<-ifelse(is.null(bakta_type$tmRNA),0,bakta_type$tmRNA )
pgap_tmrna<-ifelse(is.null(pgap_type$tmRNA),0,pgap_type$tmRNA )

#Sequence

Expand All @@ -140,8 +251,21 @@ pgap_eggnog_intersect<-length(intersect(na.omit(pgap$name),na.omit(eggnog$name))


#Create summary





col_labels_genecount<-c(c("Prokka gene count","Bakta gene count","PGAP gene count","EGGnog gene count"))
col_labels_hyps<-c(c("Prokka hypothetical genes","Bakta hypothetical genes","PGAP hypothetical genes","EGGnog hypothetical genes"))
col_labels_perc_hyps<-c(c("Prokka hypothetical genes(%)","Bakta hypothetical genes(%)","PGAP hypothetical genes(%)","EGGnog hypothetical genes(%)"))
col_labels_CDS<- c(c("Prokka CDS","Bakta CDS","PGAP CDS"))
col_labels_rrna<- c(c("Prokka rRNA","Bakta rRNA","PGAP rRNA"))
col_labels_trna<- c(c("Prokka tRNA","Bakta tRNA","PGAP tRNA"))
col_labels_tmrna<- c(c("Prokka tmRNA","Bakta tmRNA","PGAP tmRNA"))
summary_df<- data.frame(id, fasta_type, prokka_gene_ct,bakta_gene_ct,pgap_gene_ct,eggnog_gene_ct,
prokka_hyps,bakta_hyps,pgap_hyps,eggnog_hyps,
prokka_perc_hyps,bakta_perc_hyps,pgap_perc_hyps,eggnog_perc_hyps,
prokka_CDS,bakta_CDS,pgap_CDS,
prokka_rrna,bakta_rrna,pgap_rrna,
prokka_trna,bakta_trna,pgap_trna,
prokka_tmrna,bakta_tmrna,pgap_tmrna
)
colnames(summary_df) <- c("id","type",col_labels_genecount,col_labels_hyps,col_labels_perc_hyps,col_labels_CDS,col_labels_rrna,col_labels_trna,col_labels_tmrna)

write_csv(summary_df,"comparison.csv")
5 changes: 3 additions & 2 deletions configs/container.config
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
process {
withLabel: ubuntu { container = 'nanozoo/template:3.8--e13dfeb' }
withLabel: bakta { container = 'nanozoo/bakta:1.2.1--bf38720' }
withLabel: bakta { container = 'nanozoo/bakta:1.6.1--c012822' }
withLabel: prokka { container = 'nanozoo/prokka:1.14.6--c99ff65' }
withLabel: eggnog { container = 'nanozoo/eggnog-mapper:2.1.9--4f2b6c0' }
withLabel: pgap { container = 'ncbi/pgap:2022-10-03.build6384' }
withLabel: pgap { container = 'ncbi/pgap:2022-10-03.build6384' }
withLabel: r { container = 'nanozoo/ggplot2:3.4.0--3367007' }
}
4 changes: 2 additions & 2 deletions configs/local.config
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
process.executor = 'local'

process {
withLabel: ubuntu { cpus = 2 }
withLabel: ubuntu { cpus = params.cores }
withLabel: prokka { cpus = params.cores }
withLabel: bakta { cpus = params.cores }
withLabel: eggnog { cpus = params.cores }
withLabel: pgap { cpus = params.cores }

withLabel: r { cpus = params.cores }
}
3 changes: 2 additions & 1 deletion configs/nodes.config
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
process {
withLabel: ubuntu { cpus = 2; memory = '2 GB' }
withLabel: ubuntu { cpus = 8; memory = '16 GB' }
withLabel: bakta { cpus = 8; memory = '16 GB' }
withLabel: prokka { cpus = 8; memory = '16 GB' }
withLabel: eggnog { cpus = 8; memory = '16 GB' }
withLabel: pgap { cpus = 8; memory = '16 GB' }
withLabel: r { cpus = 8; memory = '16 GB' }
}
4 changes: 2 additions & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ csv_ch = Channel
**************************/
include {fasta_mod_wf} from './workflows/fasta_mod_wf.nf'
include {annotation_wf} from './workflows/annotation_wf.nf'
include {analysis} from './workflows/analysis_wf.nf'
include {analysis_wf} from './workflows/analysis_wf.nf'
//include {filter_wf} from './workflows/filter_wf.nf'


Expand All @@ -80,7 +80,7 @@ workflow {
combined_fasta_ch=fasta_mod_wf(csv_ch)
//annotation_wf(fasta_mod_wf.out.combined_fasta_ch)
annotation_wf(combined_fasta_ch)
analysis(annotation_wf.out.combined_annotation_ch)
analysis_wf(annotation_wf.out.combined)
}


Expand Down
67 changes: 59 additions & 8 deletions workflows/analysis_wf.nf
Original file line number Diff line number Diff line change
@@ -1,18 +1,69 @@
include { analysis } from './process/analyis.nf'

workflow analysis{
include { analysis} from './process/analysis.nf'
include { analysis as analysis_org} from './process/analysis.nf'
include { analysis as analysis_noise} from './process/analysis.nf'
include { analysis as analysis_split} from './process/analysis.nf'
workflow analysis_wf{
take:
combined_annotation_ch
combined
main:
//OTHER APPROACH
// branched_ch = combined.branch {
// original: it[1][0] == "original"
// noise: it[1][0] == "noise"
// split: it[1][0] == "split"
// }

//WIP filter channel to only contain original, noise, split for analysis
combined_annotation_ch.filter(
// original_grouped = branched_ch.original.groupTuple(by:0).view()
// noise_grouped = branched_ch.noise.groupTuple(by:0)
// split_grouped = branched_ch.split.groupTuple(by:0)
// analysis(original_grouped)
//END OF OTHER APPROACH
//org_ch=combined.filter{ it -> it[1][0][0] == 'noise'}.view()
//gives prokka
//combined.map { it -> tuple (it[1]) }.view()
//gives bakta
//combined.map { it -> tuple (it[2]) }.view()

original=combined.map { it ->
tuple( it[0], it[1].findAll { it.first() == "original" },it[2].findAll { it.first() == "original" },
it[3].findAll { it.first() == "original" },it[4].findAll { it.first() == "original" } )
}

analysis(combined_annotation_ch)
noise=combined.map { it ->
tuple( it[0], it[1].findAll { it.first() == "noise" },it[2].findAll { it.first() == "noise" },
it[3].findAll { it.first() == "noise" },it[4].findAll { it.first() == "noise" } )
}

split=combined.map { it ->
tuple( it[0], it[1].findAll { it.first() == "split" },it[2].findAll { it.first() == "split" },
it[3].findAll { it.first() == "split" },it[4].findAll { it.first() == "split" } )
}
original_proc=original.map { it -> tuple(it[0],
it[1].flatten()[1],it[1].flatten()[2],it[1].flatten()[3],
it[2].flatten()[1],it[2].flatten()[2],it[2].flatten()[3],
it[3].flatten()[1],it[3].flatten()[2],it[3].flatten()[3],
it[4].flatten()[1],it[4].flatten()[2],it[4].flatten()[3])}

noise=noise.map { it -> tuple(it[0],
it[1].flatten()[1],it[1].flatten()[2],it[1].flatten()[3],
it[2].flatten()[1],it[2].flatten()[2],it[2].flatten()[3],
it[3].flatten()[1],it[3].flatten()[2],it[3].flatten()[3],
it[4].flatten()[1],it[4].flatten()[2],it[4].flatten()[3])}

split=split.map { it -> tuple(it[0],
it[1].flatten()[1],it[1].flatten()[2],it[1].flatten()[3],
it[2].flatten()[1],it[2].flatten()[2],it[2].flatten()[3],
it[3].flatten()[1],it[3].flatten()[2],it[3].flatten()[3],
it[4].flatten()[1],it[4].flatten()[2],it[4].flatten()[3])}

report_original=analysis_org(original_proc,'original')
report_noise=analysis_noise(noise,'noise')
report_split=analysis_split(split,'split')

//emit:
report_original.mix(report_noise).mix(report_split).collectFile(name: 'collected_reports.csv', keepHeader: true ,storeDir: '${params.output}/final_report/')

emit:
combined

}

Loading

0 comments on commit 0b6ce07

Please sign in to comment.