working reports

AggressiveHayBale · Jan 31, 2023 · 0b6ce07 · 0b6ce07
1 parent 2a93a2d
commit 0b6ce07
Show file tree

Hide file tree

Showing 12 changed files with 257 additions and 90 deletions.
diff --git a/bin/R_comparison.R b/bin/R_comparison.R
@@ -1,34 +1,83 @@
 #!/usr/bin/env Rscript
 args = commandArgs(trailingOnly=TRUE)
 
-library(gggenomes)
-library(ampir)
+###########
+#Functions#
+###########
+
+##Ampir-read faa
+read_faa<-function (file = NULL) 
+{
+  faa_lines <- readLines(file)
+  seq_name_index <- grep(">", faa_lines)
+  seq_name <- gsub(">", "", faa_lines[seq_name_index])
+  seq_aa_start_index <- seq_name_index + 1
+  seq_aa_end_index <- c(seq_name_index, length(faa_lines) + 
+                          1)[-1] - 1
+  seq_aa <- rep(NA, length(seq_name_index))
+  for (i in seq_along(seq_name_index)) {
+    seq_aa_start <- seq_aa_start_index[i]
+    seq_aa_end <- seq_aa_end_index[i]
+    seq_aa[i] <- gsub("[[:space:]]", "", paste(faa_lines[seq_aa_start:seq_aa_end], 
+                                               collapse = ""))
+  }
+  data.frame(seq_name, seq_aa, stringsAsFactors = FALSE)
+}
+
+###########
+#Data load#
+###########
+
 library(stringr)
+library(readr)
+library(dplyr)
+library(tidyr)
+
+
+id <- args[1]
 
-prokka_gbk<- args[1]
 prokka_faa<- args[2]
+prokka_gbk<- args[3]
 
-bakta_gbff<- args[3]
 bakta_faa<- args[4]
+bakta_gbff<- args[5]
 
-pgap_gff<- args[5]
-pgap_faa<- args[6]
-
+eggnog_faa<- args[6]
 eggnog_gff<- args[7]
-eggnog_faa<- args[8]
 
+pgap_faa<- args[8]
+pgap_gff<- args[9]
+
+fasta_type <- args[10]
+
+
+
 ########
 #Prokka#
 ########
 
 # File read
-annotation <- read_gbk(prokka_gbk)
+annotation <- suppressMessages(read_delim(prokka_gbk,
+                                          delim = "\t", escape_double = FALSE,
+                                          col_names = FALSE, comment = c("#"), trim_ws = TRUE))
 fasta_data <- read_faa(prokka_faa)
-# Merging
+
+#Cleanup
+annotation <- invisible(na.omit(annotation))
+annotation<- annotation[!(annotation$X3=="region" | annotation$X3=="gene"),]
+annotation <- separate(annotation, X9, c(NA, "ID"), remove = FALSE, "ID=")
+annotation$ID <- gsub(";.*","",annotation$ID)
+annotation$product <- str_extract(annotation$X9,pattern = "product=(.*?;)") 
+annotation$product <- gsub("product=", "", annotation$product)
+annotation$product <- gsub(";", "", annotation$product)
+annotation$product[annotation$product==""] <- NA
+
 fasta_data$seq_name <- sub(" .*", "", fasta_data$seq_name )
-annotation_merged <- merge(annotation,fasta_data, by.x= "locus_tag", by.y = "seq_name", all=TRUE)
+# Merging
+annotation_merged <- merge(annotation,fasta_data, by.x= "ID", by.y = "seq_name", all=TRUE)
+###tRNA and rRNA does not have sequences - only CDS
+
 # Cleanup
-annotation_merged<- annotation_merged[!(annotation_merged$type=="region" | annotation_merged$type=="gene"),]
 colnames(annotation_merged)[colnames(annotation_merged) == 'seq_aa']<- c("sequence")
 
 prokka<- annotation_merged
@@ -38,13 +87,26 @@ prokka<- annotation_merged
 ########
 
 # File read
-annotation <- read_gbk(bakta_gbff)
+annotation <- suppressMessages(read_delim(bakta_gbff,
+                                          delim = "\t", escape_double = FALSE,
+                                          col_names = FALSE, comment = c("#"), trim_ws = TRUE))
 fasta_data <- read_faa(prokka_faa)
+
+
+#Cleanup
+annotation <- invisible(na.omit(annotation))
+annotation<- annotation[!(annotation$X3=="region" | annotation$X3=="gene"),]
+annotation <- separate(annotation, X9, c(NA, "ID"), remove = FALSE, "ID=")
+annotation$ID <- gsub(";.*","",annotation$ID)
+annotation$product <- str_extract(annotation$X9,pattern = "Name=(.*?;)") 
+annotation$product <- gsub("Name=", "", annotation$product)
+annotation$product <- gsub(";", "", annotation$product)
+annotation$product[annotation$product==""] <- NA
+
 # Merging
 fasta_data$seq_name <- sub(" .*", "", fasta_data$seq_name )
-annotation_merged <- merge(annotation,fasta_data, by.x= "locus_tag", by.y = "seq_name", all=TRUE)
+annotation_merged <- merge(annotation,fasta_data, by.x= "ID", by.y = "seq_name", all=TRUE)
 # Cleanup
-annotation_merged<- annotation_merged[!(annotation_merged$type=="region" | annotation_merged$type=="gene"),]
 colnames(annotation_merged)[colnames(annotation_merged) == 'seq_aa']<- c("sequence")
 
 bakta <- annotation_merged
@@ -54,14 +116,26 @@ bakta <- annotation_merged
 #PGAP#
 ######
 
-annotation <- read_gbk(pgap_gff)
+annotation <- suppressMessages(read_delim(pgap_gff,
+                                          delim = "\t", escape_double = FALSE,
+                                          col_names = FALSE, comment = c("#"), trim_ws = TRUE))
 fasta_data <- read_faa(pgap_faa)
+
+annotation<- annotation[!(annotation$X3=="region" | annotation$X3=="gene"),]
+
+annotation <- separate(annotation, X9, c(NA, "ID"), remove = FALSE, "ID=cds-")
+annotation$ID <- gsub(";.*","",annotation$ID)
+
+annotation$product <- str_extract(annotation$X9,pattern = "product=(.*?;)") 
+annotation$product <- gsub("product=", "", annotation$product)
+annotation$product <- gsub(";", "", annotation$product)
+
 # Merging
 fasta_data$seq_name <- sub(" .*", "", fasta_data$seq_name )
 fasta_data$seq_name <- sapply(str_split(fasta_data$seq_name,"\\|"), function(x) (x[3]))
-annotation_merged <- merge(annotation,fasta_data, by.x= "locus_tag", by.y = "seq_name", all=TRUE)
+annotation_merged <- merge(annotation,fasta_data, by.x= "ID", by.y = "seq_name", all=TRUE)
 # Cleanup
-annotation_merged<- annotation_merged[!(annotation_merged$type=="region" | annotation_merged$type=="gene"),]
+
 colnames(annotation_merged)[colnames(annotation_merged) == 'seq_aa']<- c("sequence")
 
 pgap<-annotation_merged
@@ -77,12 +151,18 @@ annotation <- suppressMessages(read_delim(eggnog_gff,
 fasta_data <- read_faa(eggnog_faa)
 # Merging
 fasta_data$seq_name <- sapply(str_split(fasta_data$seq_name," "), function(x) (x[1]))
-annotation_merged <- merge(annotation,fasta_data, by.x= "X1", by.y = "seq_name", all=TRUE)
+annotation_merged <- merge(annotation,fasta_data, by=0, all=TRUE)
 # Cleanup
 colnames(annotation_merged)[colnames(annotation_merged) == 'seq_aa']<- c("sequence")
 
 #Remove star from end of sequence 
 annotation_merged$sequence = substring(annotation_merged$sequence,1, nchar(annotation_merged$sequence)-1)
+
+em_Preferred_name <- str_extract(annotation_merged$X9,pattern = "em_Preferred_name=(.*?;)") 
+em_Preferred_name <- gsub("em_Preferred_name=", "", em_Preferred_name)
+em_Preferred_name <- gsub(";", "", em_Preferred_name)
+em_Preferred_name[em_Preferred_name==""] <- NA
+annotation_merged$product<- em_Preferred_name
 eggnog<-annotation_merged
 
 ##########
@@ -96,10 +176,19 @@ pgap_gene_ct <- nrow(pgap)
 eggnog_gene_ct  <- nrow(eggnog)
 
 #Hypothetical proteins
-nrow(prokka[prokka$product=="hypothetical protein",])
-nrow(bakta[bakta$product=="hypothetical protein",])
-nrow(pgap[pgap$product=="hypothetical protein",])
-sum(is.na(eggnog$X8))+nrow(eggnog[eggnog$X8=="-" & eggnog$X9=="-",]) + sum(str_detect(na.omit(eggnog$X8), "unknown"))
+prokka_hyps <- nrow(prokka[prokka$product=="hypothetical protein",])
+bakta_hyps <- nrow(bakta[bakta$product=="hypothetical protein",])
+pgap_hyps <- nrow(pgap[pgap$product=="hypothetical protein",])
+eggnog_hyps <- nrow(pgap[pgap$product==NA,])
+#eggnog_hyps<- sum(is.na(eggnog$X8))+nrow(eggnog[eggnog$X8=="-" & eggnog$X9=="-",]) + sum(str_detect(na.omit(eggnog$X8), "unknown"))
+
+#Percent of hyps 
+prokka_perc_hyps <- (prokka_hyps/prokka_gene_ct)*100
+bakta_perc_hyps <- (bakta_hyps/bakta_gene_ct)*100
+pgap_perc_hyps <- (pgap_hyps/pgap_gene_ct)*100
+eggnog_perc_hyps<- (eggnog_hyps/eggnog_gene_ct)*100
+
+
 
 ##Check what could be hyps
 #library(wordcloud)
@@ -112,9 +201,31 @@ sum(is.na(eggnog$X8))+nrow(eggnog[eggnog$X8=="-" & eggnog$X9=="-",]) + sum(str_d
 #wordcloud(words = dat$word, freq = dat$freq,colors=brewer.pal(8, "Dark2"))
 
 #Type of sequence
-prokka_type <-table(prokka$type)
-bakta_type <- table(bakta$type)
-pgap_type <- table(pgap$type)
+prokka_type <-as.list(table(prokka$X3))
+bakta_type <- as.list(table(bakta$X3))
+pgap_type <- as.list(table(pgap$X3))
+
+prokka_type <-as.list(table(prokka$X3))
+bakta_type <- as.list(table(bakta$X3))
+pgap_type <- as.list(table(pgap$X3))
+
+prokka_CDS<-ifelse(is.null(prokka_type$CDS),0,prokka_type$CDS )
+bakta_CDS<- ifelse(is.null(bakta_type$CDS),0,bakta_type$CDS )
+pgap_CDS<- ifelse(is.null(pgap_type$CDS),0,pgap_type$CDS )
+
+prokka_rrna<- ifelse(is.null(prokka_type$rRNA),0,prokka_type$rRNA )
+bakta_rrna<-ifelse(is.null(bakta_type$rRNA),0,bakta_type$rRNA )
+pgap_rrna<- ifelse(is.null(pgap_type$rRNA),0,pgap_type$rRNA )
+
+
+prokka_trna<-ifelse(is.null(prokka_type$tRNA),0,pgap_type$tRNA )
+bakta_trna<-ifelse(is.null(bakta_type$tRNA),0,bakta_type$tRNA )
+pgap_trna<-ifelse(is.null(pgap_type$tRNA),0,pgap_type$tRNA )
+
+
+prokka_tmrna<-ifelse(is.null(prokka_type$tmRNA),0,pgap_type$tmRNA )
+bakta_tmrna<-ifelse(is.null(bakta_type$tmRNA),0,bakta_type$tmRNA )
+pgap_tmrna<-ifelse(is.null(pgap_type$tmRNA),0,pgap_type$tmRNA )
 
 #Sequence 
 
@@ -140,8 +251,21 @@ pgap_eggnog_intersect<-length(intersect(na.omit(pgap$name),na.omit(eggnog$name))
 
 
 #Create summary
-
-
-
-
-
+col_labels_genecount<-c(c("Prokka gene count","Bakta gene count","PGAP gene count","EGGnog gene count"))
+col_labels_hyps<-c(c("Prokka hypothetical genes","Bakta hypothetical genes","PGAP hypothetical genes","EGGnog hypothetical genes"))
+col_labels_perc_hyps<-c(c("Prokka hypothetical genes(%)","Bakta hypothetical genes(%)","PGAP hypothetical genes(%)","EGGnog hypothetical genes(%)"))
+col_labels_CDS<- c(c("Prokka CDS","Bakta CDS","PGAP CDS"))
+col_labels_rrna<- c(c("Prokka rRNA","Bakta rRNA","PGAP rRNA"))
+col_labels_trna<- c(c("Prokka tRNA","Bakta tRNA","PGAP tRNA"))
+col_labels_tmrna<- c(c("Prokka tmRNA","Bakta tmRNA","PGAP tmRNA"))
+summary_df<- data.frame(id, fasta_type, prokka_gene_ct,bakta_gene_ct,pgap_gene_ct,eggnog_gene_ct,
+                        prokka_hyps,bakta_hyps,pgap_hyps,eggnog_hyps,
+                        prokka_perc_hyps,bakta_perc_hyps,pgap_perc_hyps,eggnog_perc_hyps,
+                        prokka_CDS,bakta_CDS,pgap_CDS,
+                        prokka_rrna,bakta_rrna,pgap_rrna,
+                        prokka_trna,bakta_trna,pgap_trna,                        
+                        prokka_tmrna,bakta_tmrna,pgap_tmrna
+                        )
+colnames(summary_df) <- c("id","type",col_labels_genecount,col_labels_hyps,col_labels_perc_hyps,col_labels_CDS,col_labels_rrna,col_labels_trna,col_labels_tmrna)
+
+write_csv(summary_df,"comparison.csv")
diff --git a/configs/container.config b/configs/container.config
@@ -1,7 +1,8 @@
 process {
     withLabel:  ubuntu      { container = 'nanozoo/template:3.8--e13dfeb' }
-    withLabel:  bakta       { container = 'nanozoo/bakta:1.2.1--bf38720' } 
+    withLabel:  bakta       { container = 'nanozoo/bakta:1.6.1--c012822' } 
     withLabel:  prokka      { container = 'nanozoo/prokka:1.14.6--c99ff65' }
     withLabel:  eggnog      { container = 'nanozoo/eggnog-mapper:2.1.9--4f2b6c0' }
-    withLabel:  pgap        { container = 'ncbi/pgap:2022-10-03.build6384' }    
+    withLabel:  pgap        { container = 'ncbi/pgap:2022-10-03.build6384' }
+    withLabel:  r           { container = 'nanozoo/ggplot2:3.4.0--3367007' }    
     }
diff --git a/configs/local.config b/configs/local.config
@@ -1,10 +1,10 @@
 process.executor = 'local'
 
 process {
-    withLabel:  ubuntu      { cpus = 2 }
+    withLabel:  ubuntu      { cpus = params.cores }
     withLabel:  prokka      { cpus = params.cores }
     withLabel:  bakta       { cpus = params.cores } 
     withLabel:  eggnog      { cpus = params.cores } 
     withLabel:  pgap      { cpus = params.cores } 
-
+    withLabel:  r      { cpus = params.cores } 
     }
diff --git a/configs/nodes.config b/configs/nodes.config
@@ -1,7 +1,8 @@
 process {
-    withLabel:  ubuntu      { cpus = 2; memory = '2 GB' }
+    withLabel:  ubuntu      { cpus = 8; memory = '16 GB' }
     withLabel:  bakta       { cpus = 8; memory = '16 GB' }
     withLabel:  prokka      { cpus = 8; memory = '16 GB' }
     withLabel:  eggnog      { cpus = 8; memory = '16 GB' }
     withLabel:  pgap      { cpus = 8; memory = '16 GB' }
+    withLabel:  r      { cpus = 8; memory = '16 GB' }
     }
diff --git a/main.nf b/main.nf
@@ -68,7 +68,7 @@ csv_ch = Channel
 **************************/
 include {fasta_mod_wf} from './workflows/fasta_mod_wf.nf'
 include {annotation_wf} from './workflows/annotation_wf.nf'
-include {analysis} from './workflows/analysis_wf.nf'
+include {analysis_wf} from './workflows/analysis_wf.nf'
 //include {filter_wf} from './workflows/filter_wf.nf'
 
 
@@ -80,7 +80,7 @@ workflow {
 combined_fasta_ch=fasta_mod_wf(csv_ch)
 //annotation_wf(fasta_mod_wf.out.combined_fasta_ch)
 annotation_wf(combined_fasta_ch)
-analysis(annotation_wf.out.combined_annotation_ch)
+analysis_wf(annotation_wf.out.combined)
 }
 
 

diff --git a/workflows/analysis_wf.nf b/workflows/analysis_wf.nf
@@ -1,18 +1,69 @@
-include { analysis } from './process/analyis.nf' 
-
-workflow analysis{
+include { analysis} from './process/analysis.nf' 
+include { analysis as analysis_org} from './process/analysis.nf' 
+include { analysis as analysis_noise} from './process/analysis.nf' 
+include { analysis as analysis_split} from './process/analysis.nf' 
+workflow analysis_wf{
     take: 
-        combined_annotation_ch 
+    combined
     main: 
+    //OTHER APPROACH
+    // branched_ch = combined.branch {
+    //          original: it[1][0] == "original"
+    //          noise: it[1][0] == "noise"
+    //          split: it[1][0] == "split"
+    //      }
 
-    //WIP filter channel to only contain original, noise, split for analysis 
-    combined_annotation_ch.filter(
+    // original_grouped = branched_ch.original.groupTuple(by:0).view()
+    // noise_grouped = branched_ch.noise.groupTuple(by:0)
+    // split_grouped = branched_ch.split.groupTuple(by:0)
+    // analysis(original_grouped)
+    //END OF OTHER APPROACH
+    //org_ch=combined.filter{ it -> it[1][0][0] == 'noise'}.view()
+    //gives prokka
+    //combined.map { it -> tuple (it[1]) }.view()
+    //gives bakta
+    //combined.map { it -> tuple (it[2]) }.view()
+
+    original=combined.map { it ->
+        tuple( it[0], it[1].findAll { it.first() == "original" },it[2].findAll { it.first() == "original" },
+        it[3].findAll { it.first() == "original" },it[4].findAll { it.first() == "original" } )
+    }
 
-    analysis(combined_annotation_ch)
+    noise=combined.map { it ->
+        tuple( it[0], it[1].findAll { it.first() == "noise" },it[2].findAll { it.first() == "noise" },
+        it[3].findAll { it.first() == "noise" },it[4].findAll { it.first() == "noise" } )
+    }
 
+    split=combined.map { it ->
+        tuple( it[0], it[1].findAll { it.first() == "split" },it[2].findAll { it.first() == "split" },
+        it[3].findAll { it.first() == "split" },it[4].findAll { it.first() == "split" } )
+    }
+    original_proc=original.map { it -> tuple(it[0],
+    it[1].flatten()[1],it[1].flatten()[2],it[1].flatten()[3],
+    it[2].flatten()[1],it[2].flatten()[2],it[2].flatten()[3],
+    it[3].flatten()[1],it[3].flatten()[2],it[3].flatten()[3],
+    it[4].flatten()[1],it[4].flatten()[2],it[4].flatten()[3])}
+
+    noise=noise.map { it -> tuple(it[0],
+    it[1].flatten()[1],it[1].flatten()[2],it[1].flatten()[3],
+    it[2].flatten()[1],it[2].flatten()[2],it[2].flatten()[3],
+    it[3].flatten()[1],it[3].flatten()[2],it[3].flatten()[3],
+    it[4].flatten()[1],it[4].flatten()[2],it[4].flatten()[3])}
+
+    split=split.map { it -> tuple(it[0],
+    it[1].flatten()[1],it[1].flatten()[2],it[1].flatten()[3],
+    it[2].flatten()[1],it[2].flatten()[2],it[2].flatten()[3],
+    it[3].flatten()[1],it[3].flatten()[2],it[3].flatten()[3],
+    it[4].flatten()[1],it[4].flatten()[2],it[4].flatten()[3])}
+
+    report_original=analysis_org(original_proc,'original')
+    report_noise=analysis_noise(noise,'noise')
+    report_split=analysis_split(split,'split')
 
-    //emit: 
+    report_original.mix(report_noise).mix(report_split).collectFile(name: 'collected_reports.csv', keepHeader: true ,storeDir: '${params.output}/final_report/')
 
+    emit: 
+    combined
 
 }