Dev (#8)

* added cram-to-bam. updated pairedtoubam to use gatk4 * Update README.md removed gatk software requirement because repo will contain more than one wdl which may use different versions. * Update README.md * Wdl now uses a readgroup tsv file as input. Added task to compose a file containing a list of the generated ubams * minor * minor * minor edits * corrected memory placement * minor edits * added bam-to-unmapped-bams wdl * fixed comment number * changed to use latest gatk docker * fastq to bam now uses arrays as input * updated descriptor for paired fastq to bam * updated input in description * added a firecloud version for fastq to Ubam * minor format changes. changed pairedfastq2Ubam docker to gcr * Minor update to ReadMe, added default docker to cram2bam * decreased mem size in cram2bam to reduce cost * Updated WDL to 1.0, removed GenerateOutputMap from bam2ubam, removed fc version of fastq2ubam, added defaults to cram2bam * Updated WDL to 1.0, removed GenerateOutputMap from bam2ubam, added defaults to cram2bam * minor format change to Readme * replaced '$' with '~'
gatk-workflows · Dec 19, 2019 · 088149a · 088149a
1 parent 03b6522
commit 088149a
Show file tree

Hide file tree

Showing 10 changed files with 247 additions and 436 deletions.
diff --git a/README.md b/README.md
@@ -20,16 +20,17 @@ causes them to not validate  with Picard.
 ### paired-fastq-to-unmapped-bam :
 This WDL converts paired FASTQ to uBAM and adds read group information 
 
-*NOTE: paired-fastq-to-unmapped-bam-fc.wdl is a slightly modified version of the original to support users interested running on FireCloud. 
-As input this wdl takes a TSV with each row being a different readgroup and each column in the row being descriptors*
-
 #### Requirements/expectations 
 - Paired-end sequencing data in FASTQ format (one file per orientation)
 - The following metadata descriptors per sample: 
-```
-readgroup   fastq_pair1_file_path   fastq_pair2_file_path   sample_name   library_name   platform_unit   run_date   platform_name   sequecing_center
-```  
-
+  - readgroup   
+  - sample_name
+  - library_name
+  - platform_unit
+  - run_date
+  - platform_name
+  - sequencing_center
+
 #### Outputs 
 - Set of unmapped BAMs, one per read group
 - File containing a list of the generated unmapped BAMs 
@@ -42,6 +43,19 @@ This WDL converts BAM  to unmapped BAMs
 
 #### Outputs 
 - Sorted Unmapped BAMs
+- Text file listing the unmapped file paths (FOFN)
+
+### interleaved-fastq-to-paired-fastq :
+This WDL takes in a single interleaved (R1+R2) FASTQ file and splits it into 
+separate R1 and R2 FASTQ (i.e. paired FASTQ) files. Paired FASTQ files are the input 
+format for the tool that generates unmapped BAMs (the format used in most 
+GATK processing and analysis tools).
+
+#### Requirements/expectations 
+- Interleaved FASTQ file
+
+#### Outputs 
+- Separate R1 and R2 FASTQ files (i.e. paired FASTQ)
 
 ### interleaved-fastq-to-paired-fastq :
 This WDL takes in a single interleaved (R1+R2) FASTQ file and splits it into separate R1 and R2 FASTQ (i.e. paired FASTQ) files. Paired FASTQ files are the input format for the tool that generates unmapped BAMs (the format used in most GATK processing and analysis tools).
@@ -57,7 +71,7 @@ This WDL takes in a single interleaved(R1+R2) FASTQ file and separates it into s
 - Samtools 1.3.1
 - Picard 2.8.3
 - Cromwell version support 
-  - Successfully tested on v32
+  - Successfully tested on v47
   - Does not work on versions < v23 due to output syntax
 
 ### Important Note :

diff --git a/bam-to-unmapped-bams.inputs.json b/bam-to-unmapped-bams.inputs.json
@@ -1,6 +1,5 @@
 
 {
-  "##_COMMENT1": "INPUTS",  
   "BamToUnmappedBams.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_20k_hg38/NA12878.bam"
 }
 
diff --git a/bam-to-unmapped-bams.wdl b/bam-to-unmapped-bams.wdl
@@ -1,3 +1,4 @@
+version 1.0
 ## Copyright Broad Institute, 2018
 ## 
 ## This WDL converts BAM  to unmapped BAMs
@@ -25,35 +26,21 @@
 
 # WORKFLOW DEFINITION
 workflow BamToUnmappedBams {
-  File input_bam
+  input {
+    File input_bam
 
-  Int? additional_disk_size
-  Int additional_disk = select_first([additional_disk_size, 20])
-
-  Float input_size = size(input_bam, "GB")
-
-  String? gatk_path
-  String path2gatk = select_first([gatk_path, "/gatk/gatk"])
-
-  String? gitc_docker
-  String gitc_image = select_first([gitc_docker, "broadinstitute/genomes-in-the-cloud:2.3.1-1512499786"])
-  String? gatk_docker 
-  String gatk_image = select_first([gatk_docker, "broadinstitute/gatk:latest"])
-
-  call GenerateOutputMap {
-    input:
-      input_bam = input_bam,
-      disk_size = ceil(input_size) + additional_disk,
-      docker = gitc_image
+    Int additional_disk_size = 20
+    String gatk_docker = "broadinstitute/gatk:latest"
+    String gatk_path = "/gatk/gatk"
   }
-
+    Float input_size = size(input_bam, "GB")
+
   call RevertSam {
     input:
       input_bam = input_bam,
-      output_map = GenerateOutputMap.output_map,
-      disk_size = ceil(input_size * 3) + additional_disk,
-      docker = gatk_image,
-      gatk_path = path2gatk
+      disk_size = ceil(input_size * 3) + additional_disk_size,
+      docker = gatk_docker,
+      gatk_path = gatk_path
   }
 
   scatter (unmapped_bam in RevertSam.unmapped_bams) {
@@ -64,9 +51,9 @@ workflow BamToUnmappedBams {
       input:
         input_bam = unmapped_bam,
         sorted_bam_name = output_basename + ".unmapped.bam",
-        disk_size = ceil(unmapped_bam_size * 6) + additional_disk,
-        docker = gatk_image,
-        gatk_path = path2gatk
+        disk_size = ceil(unmapped_bam_size * 6) + additional_disk_size,
+        docker = gatk_docker,
+        gatk_path = gatk_path
     }
   }
 
@@ -75,49 +62,26 @@ workflow BamToUnmappedBams {
   }
 }
 
-task GenerateOutputMap {
-  File input_bam
-  Int disk_size
-
-  String docker
-
-  command {
-    set -e
-
-    samtools view -H ${input_bam} | grep @RG | cut -f2 | sed s/ID:// > readgroups.txt
-
-    echo -e "READ_GROUP_ID\tOUTPUT" > output_map.tsv
-
-    for rg in `cat readgroups.txt`; do
-      echo -e "$rg\t$rg.coord.sorted.unmapped.bam" >> output_map.tsv
-    done
-  }
-
-  runtime {
-    docker: docker
-    disks: "local-disk " + disk_size + " HDD"
-    preemptible: "3"
-    memory: "1 GB"
-  }
-  output {
-    File output_map = "output_map.tsv"
-  }
-}
-
 task RevertSam {
-  File input_bam
-  File output_map
-  Int disk_size
-
-  String gatk_path
-
-  String docker
+  input {
+    #Command parameters
+    File input_bam
+    String gatk_path
+
+    #Runtime parameters
+    Int disk_size
+    String docker
+    Int machine_mem_gb = 2
+    Int preemptible_attempts = 3
+  }
+    Int command_mem_gb = machine_mem_gb - 1    ####Needs to occur after machine_mem_gb is set 
 
-  command {
-    ${gatk_path} --java-options "-Xmx1000m" \
+  command { 
+
+    ~{gatk_path} --java-options "-Xmx~{command_mem_gb}g" \
     RevertSam \
-    --INPUT ${input_bam} \
-    --OUTPUT_MAP ${output_map} \
+    --INPUT ~{input_bam} \
+    --OUTPUT ./ \
     --OUTPUT_BY_READGROUP true \
     --VALIDATION_STRINGENCY LENIENT \
     --ATTRIBUTE_TO_CLEAR FT \
@@ -127,38 +91,44 @@ task RevertSam {
   runtime {
     docker: docker
     disks: "local-disk " + disk_size + " HDD"
-    memory: "1200 MB"
+    memory: machine_mem_gb + " GB"
+    preemptible: preemptible_attempts
   }
   output {
     Array[File] unmapped_bams = glob("*.bam")
   }
 }
 
 task SortSam {
-  File input_bam
-  String sorted_bam_name
-  Int disk_size
-
-  String gatk_path
-
-  String docker
+  input {
+    #Command parameters
+    File input_bam
+    String sorted_bam_name
+    #Runtime parameters
+    String gatk_path
+    Int disk_size
+    String docker
+    Int machine_mem_gb = 4
+    Int preemptible_attempts = 3
+  }
+    Int command_mem_gb = machine_mem_gb - 1    ####Needs to occur after machine_mem_gb is set 
 
   command {
-    ${gatk_path} --java-options "-Xmx3000m" \
+    ~{gatk_path} --java-options "-Xmx~{command_mem_gb}g" \
     SortSam \
-    --INPUT ${input_bam} \
-    --OUTPUT ${sorted_bam_name} \
+    --INPUT ~{input_bam} \
+    --OUTPUT ~{sorted_bam_name} \
     --SORT_ORDER queryname \
     --MAX_RECORDS_IN_RAM 1000000
   }
   runtime {
     docker: docker
     disks: "local-disk " + disk_size + " HDD"
-    memory: "3500 MB"
-    preemptible: 3
+    memory: machine_mem_gb + " GB"
+    preemptible: preemptible_attempts
   }
   output {
-    File sorted_bam = "${sorted_bam_name}"
+    File sorted_bam = "~{sorted_bam_name}"
   }
 }
 
diff --git a/cram-to-bam.inputs.json b/cram-to-bam.inputs.json
@@ -1,24 +1,8 @@
 {
-  "##_COMMENT1": "INPUTS",
-  "CramToBamFlow.CramToBamTask.SampleName": "NA12878",
-  "CramToBamFlow.CramToBamTask.InputCram": "gs://gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram",
+  "CramToBamFlow.CramToBamTask.sample_name": "NA12878",
+  "CramToBamFlow.CramToBamTask.input_cram": "gs://gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram",
 
-  "##_COMMENT2": "REFERENCES",
-  "CramToBamFlow.CramToBamTask.RefDict": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict",
-  "CramToBamFlow.CramToBamTask.RefFasta": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta",
-  "CramToBamFlow.CramToBamTask.RefIndex": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
-
-  "##_COMMENT3": "DOCKER",
-  "CramToBamFlow.gotc_docker_override": "String optional",
-
-  "##_COMMENT4": "DISK SIZE",
-  "CramToBamFlow.validate_sam_file_disk_size": "200",
-  "CramToBamFlow.cram_to_bam_disk_size": "200",
-
-  "##_COMMENT3": "MEMORY",
-  "CramToBamFlow.validate_sam_file_mem_size": "3750 MB",
-  "CramToBamFlow.cram_to_bam_mem_size": "3.75 GB",
-
-  "##_COMMENT3": "PREEMPTIBLES",
-  "CramToBamFlow.ValidateSamFile.preemptible_tries": "3"
+  "CramToBamFlow.CramToBamTask.ref_dict": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict",
+  "CramToBamFlow.CramToBamTask.ref_fasta": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta",
+  "CramToBamFlow.CramToBamTask.ref_fasta_index": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai"
 }