Wdlupdate (#3)

* added cram-to-bam. updated pairedtoubam to use gatk4. removed gatk software requirement because repo will contain more than one wdl which may use different versions.
gatk-workflows · Feb 23, 2018 · 413fca8 · 413fca8
1 parent 03f84a6
commit 413fca8
Show file tree

Hide file tree

Showing 5 changed files with 215 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,35 @@
 # seq-format-conversion
 Workflows for converting between sequence data formats
+
+### cram-to-bam :
+This script should convert a CRAM to SAM to BAM and output a BAM, BAM Index, 
+and validation report to a Google bucket. If you'd like to do this on multiple CRAMs, 
+create a sample set in the Data tab.  
+This approach was chosen instead of converting CRAM to BAM directly 
+using Samtools because Samtools 1.3 produces incorrect bins due to an old version of htslib 
+included in the package, and Samtools versions 1.4 & 1.5 have an NM issue that 
+causes them to not validate with Picard. 
+
+#### Requirements/expectations
+- Cram file 
+
+#### Outputs 
+- Bam file and index
+- Validation report
+
+### paired-fastq-to-unmapped-bam :
+This WDL converts paired FASTQ to uBAM and adds read group information 
+
+#### Requirements/expectations
+- Paired-end sequencing data in FASTQ format (one file per orientation)
+- One or more read groups, one per pair of FASTQ files 
+
+#### Outputs 
+- Set of unmapped BAMs, one per read group
+
+
+### Software version requirements :
+Cromwell version support 
+- Successfully tested on v30.2
+- Does not work on versions < v23 due to output syntax
+
diff --git a/cram-to-bam.inputs.json b/cram-to-bam.inputs.json
@@ -0,0 +1,25 @@
+{
+  "##_COMMENT1": "INPUTS",
+  "CramToBamFlow.CramToBamTask.SampleName": "NA12878",
+  "CramToBamFlow.CramToBamTask.InputCram": "gs://gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram",
+
+  "##_COMMENT2": "REFERENCES",
+  "CramToBamFlow.CramToBamTask.RefDict": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict",
+  "CramToBamFlow.CramToBamTask.RefFasta": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta",
+  "CramToBamFlow.CramToBamTask.RefIndex": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
+
+  "##_COMMENT3": "DOCKER",
+  "CramToBamFlow.ValidateSamFile.docker_image": "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817",
+  "CramToBamFlow.CramToBamTask.docker_image": "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817",
+
+  "##_COMMENT4": "DISK SIZE",
+  "CramToBamFlow.validate_sam_file_disk_size": "200",
+  "CramToBamFlow.cram_to_bam_disk_size": "200",
+
+  "##_COMMENT5": "MEMORY",
+  "CramToBamFlow.validate_sam_file_mem_size": "3500 MB",
+  "CramToBamFlow.cram_to_bam_mem_size": "15 GB",
+
+  "##_COMMENT6": "PREEMPTIBLES",
+  "CramToBamFlow.ValidateSamFile.preemptible_tries": "3"
+}
diff --git a/cram-to-bam.wdl b/cram-to-bam.wdl
@@ -0,0 +1,135 @@
+## Copyright Broad Institute, 2017
+## This script should convert a CRAM to SAM to BAM and output a BAM, BAM Index, and validation report to a Google bucket.
+## If you'd like to do this on multiple CRAMs, create a sample set in the Data tab.
+## This approach was chosen instead of converting CRAM to BAM directly using Samtools because Samtools 1.3 produces incorrect bins due to an old version of htslib included in the package, and Samtools versions 1.4 & 1.5 have an NM issue that causes them to not validate with Picard.
+## 
+## TESTED: It was tested using the Genomes in the Cloud Docker image version 2.3.1-1500064817. 
+## Versions of other tools on this image at the time of testing:
+## PICARD_VER=1.1150
+## GATK34_VER=3.4-g3c929b0
+## GATK35_VER=3.5-0-g36282e4
+## GATK36_VER=3.6-44-ge7d1cd2
+## GATK4_VER=4.beta.1
+## SAMTOOLS_VER=1.3.1
+## BWA_VER=0.7.15.r1140
+## TABIX_VER=0.2.5_r1005
+## BGZIP_VER=1.3
+## SVTOOLKIT_VER=2.00-1650
+## It was tested pulling the HG38 reference Fasta and Fai.
+## Successfully tested on Cromwell version 28. Does not work on versions < v23 due to output syntax 
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. 
+##
+## LICENSING : This script is released under the WDL source code license (BSD-3) (see LICENSE in https://github.com/broadinstitute/wdl). 
+## Note however that the programs it calls may be subject to different licenses. Users are responsible for checking that they are authorized to run all programs before running this script. 
+## Please see the docker for detailed licensing information pertaining to the included programs.
+##
+#WORKFLOW DEFINITION
+workflow CramToBamFlow {
+  # Per-task disk and memory settings, forwarded to the calls below.
+  Int cram_to_bam_disk_size
+  String cram_to_bam_mem_size
+  Int validate_sam_file_disk_size
+  String validate_sam_file_mem_size
+
+  # Step 1: convert the CRAM to SAM to BAM and build the BAM index.
+  call CramToBamTask {
+    input:
+      mem_size = cram_to_bam_mem_size,
+      disk_size = cram_to_bam_disk_size
+  }
+
+  # Step 2: validate the BAM produced by step 1.
+  call ValidateSamFile {
+    input:
+      mem_size = validate_sam_file_mem_size,
+      disk_size = validate_sam_file_disk_size,
+      input_bam = CramToBamTask.outputBam
+  }
+
+  # Workflow-level results: the BAM, its index, and the validation report.
+  # These are the values handed back to the FireCloud data model.
+  output {
+    File outputBam = CramToBamTask.outputBam
+    File outputBai = CramToBamTask.outputBai
+    File validation_report = ValidateSamFile.report
+  }
+}
+
+#Task Definitions
+task CramToBamTask {
+  # Reference inputs. NOTE(review): RefIndex/RefDict are not named in the command; presumably localized alongside RefFasta for samtools - confirm.
+  File RefFasta
+  File RefIndex
+  File RefDict
+  # CRAM to convert; the CRAM index input below is currently disabled.
+  File InputCram
+  #File InputCrai
+  # SampleName names the output files; the remaining inputs are runtime settings.
+  String SampleName
+  String docker_image
+  String mem_size
+  Int disk_size
+
+  # Pipe `samtools view` (CRAM in, using RefFasta) into a second `samtools view -b`
+  # that writes the BAM, then index it and rename <name>.bam.bai to <name>.bai.
+  command {
+    #Set -e and -o says if any command I run fails in this script, make sure to return a failure
+    set -e
+    set -o pipefail
+
+    samtools view -h -T ${RefFasta} ${InputCram} |
+    samtools view -b -o ${SampleName}.bam -
+    samtools index -b ${SampleName}.bam
+    mv ${SampleName}.bam.bai ${SampleName}.bai
+  }
+
+  # Runtime attributes: docker_image must provide samtools; one core is
+  # requested because no multi-threading is needed; disk_size should cover
+  # input size + output size + buffer.
+  runtime {
+    docker: docker_image
+    memory: mem_size
+    cpu: "1"
+    disks: "local-disk " + disk_size + " HDD"
+  }
+
+  # The BAM and its index, both named after SampleName.
+  output {
+    File outputBam = "${SampleName}.bam"
+    File outputBai = "${SampleName}.bai"
+  }
+}
+
+#Validates BAM output to ensure it wasn't corrupted during the file conversion
+task ValidateSamFile {
+  File input_bam
+  String output_name = basename(input_bam, ".bam") + ".validation_report"
+  String docker_image
+  String mem_size
+  Int disk_size
+  Int preemptible_tries
+  # Run Picard ValidateSamFile in SUMMARY mode; the JVM is started with -Xmx3000m.
+  command {
+    java -Xmx3000m -jar /usr/gitc/picard.jar \
+      ValidateSamFile \
+      INPUT=${input_bam} \
+      OUTPUT=${output_name} \
+      MODE=SUMMARY \
+      IS_BISULFITE_SEQUENCED=false 
+  }
+  # Runtime attributes: docker_image must contain /usr/gitc/picard.jar.
+  # Return codes 0 and 1 are both accepted by continueOnReturnCode below; see
+  # https://github.com/broadinstitute/cromwell#continueonreturncode
+  runtime {
+    docker: docker_image
+    memory: mem_size
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+    continueOnReturnCode: [0,1]
+  }
+  # Text report listing whatever errors or warnings apply to the BAM.
+  output {
+    File report = "${output_name}"
+  }
+}
+
diff --git a/paired-fastq-to-unmapped-bam.inputs.json b/paired-fastq-to-unmapped-bam.inputs.json
@@ -1,4 +1,5 @@
 {
+  "##_Comment1": "Inputs",
   "ConvertPairedFastQsToUnmappedBamWf.readgroup_list": [
   	"NA12878_A", "NA12878_B", "NA12878_C"
   ],
@@ -27,6 +28,13 @@
   		"gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_reads_2.fastq"
   	]
   },
+
+  "##_Comment2": "GATK Launch Path",
+  "ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.gatk_path": "/gatk/gatk",
+
+  "##_Comment3": "Runtime Parameters",
+  "ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.docker": "broadinstitute/gatk:4.0.1.2",
   "ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.mem_size": "1 GB",
   "ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.disk_size": 200
 }
+
diff --git a/paired-fastq-to-unmapped-bam.wdl b/paired-fastq-to-unmapped-bam.wdl
@@ -69,23 +69,25 @@ task PairedFastQsToUnmappedBAM {
   String sequencing_center
   Int disk_size
   String mem_size
+  String docker
+  String gatk_path
 
   command {
-    java -Xmx3000m -jar /usr/gitc/picard.jar \
+    ${gatk_path} --java-options "-Xmx3000m" \
       FastqToSam \
-      FASTQ=${fastq_1} \
-      FASTQ2=${fastq_2} \
-      OUTPUT=${readgroup_name}.bam \
-      READ_GROUP_NAME=${readgroup_name} \
-      SAMPLE_NAME=${sample_name} \
-      LIBRARY_NAME=${library_name} \
-      PLATFORM_UNIT=${platform_unit} \
-      RUN_DATE=${run_date} \
-      PLATFORM=${platform_name} \
-      SEQUENCING_CENTER=${sequencing_center} 
+      --FASTQ ${fastq_1} \
+      --FASTQ2 ${fastq_2} \
+      --OUTPUT ${readgroup_name}.unmapped.bam \
+      --READ_GROUP_NAME ${readgroup_name} \
+      --SAMPLE_NAME ${sample_name} \
+      --LIBRARY_NAME ${library_name} \
+      --PLATFORM_UNIT ${platform_unit} \
+      --RUN_DATE ${run_date} \
+      --PLATFORM ${platform_name} \
+      --SEQUENCING_CENTER ${sequencing_center} 
   }
   runtime {
-    docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
+    docker: docker
     memory: mem_size
     cpu: "1"
     disks: "local-disk " + disk_size + " HDD"