Skip to content

Commit

Permalink
Wdlupdate (#4)
Browse files Browse the repository at this point in the history
* added cram-to-bam. updated pairedtoubam to use gatk4

* Update README.md

* removed gatk software requirement because repo will contain more than one wdl which may use different versions.

* Wdl now uses a readgroup tsv file as input. Added task to compose a file containing a list of the generated ubams

* added bam-to-unmapped-bams wdl

* changed to use latest gatk docker

* fastq to bam now uses arrays as input

* updated descriptor for paired fastq to bam

* added a firecloud version for fastq to Ubam

* changed pairedfastq2Ubam docker to gcr
  • Loading branch information
bshifaw authored Jul 13, 2018
1 parent 413fca8 commit 8f36d69
Show file tree
Hide file tree
Showing 6 changed files with 444 additions and 69 deletions.
20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,32 @@ causes them to not validate with Picard.
### paired-fastq-to-unmapped-bam :
This WDL converts paired FASTQ to uBAM and adds read group information

*NOTE: paired-fastq-to-unmapped-bam-fc.wdl is a slightly modified version of the original to support users interested in running on FireCloud.
As input, this WDL takes a TSV in which each row is a different read group and each column of the row is a descriptor.*

#### Requirements/expectations
- Pair-end sequencing data in FASTQ format (one file per orientation)
- One or more read groups, one per pair of FASTQ files
- The following metadata descriptors per sample:
```
readgroup fastq_pair1_file_path fastq_pair2_file_path sample_name library_name platform_unit run_date platform_name sequencing_center
```

#### Outputs
- Set of unmapped BAMs, one per read group
- File containing a list of the generated unmapped BAMs

### bam-to-unmapped-bams :
This WDL converts BAM to unmapped BAMs

#### Requirements/expectations
- BAM file

#### Outputs
- Sorted Unmapped BAMs

### Software version requirements :
Cromwell version support
- Successfully tested on v30.2
- Successfully tested on v32
- Does not work on versions < v23 due to output syntax

Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
6 changes: 6 additions & 0 deletions bam-to-unmapped-bams.inputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

{
"##_COMMENT1": "INPUTS",
"BamToUnmappedBams.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_20k_hg38/NA12878.bam"
}

164 changes: 164 additions & 0 deletions bam-to-unmapped-bams.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
## Copyright Broad Institute, 2018
##
## This WDL converts BAM to unmapped BAMs
##
## Requirements/expectations :
## - BAM file
##
## Outputs :
## - Sorted Unmapped BAMs
##
## Cromwell version support
## - Successfully tested on v33
## - Does not work on versions < v23 due to output syntax
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
## For program versions, see docker containers.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
## licensing information pertaining to the included programs.
# WORKFLOW DEFINITION
workflow BamToUnmappedBams {
  # BAM to be reverted and split into one unmapped BAM per read group
  File input_bam

  # Optional overrides; when left unset, the defaults in the select_first calls below apply
  Int? additional_disk_size
  String? gatk_path
  String? gatk_docker
  String? gitc_docker

  # Resolved settings handed to the tasks
  Int additional_disk = select_first([additional_disk_size, 20])
  String path2gatk = select_first([gatk_path, "/gatk/gatk"])
  String gatk_image = select_first([gatk_docker, "broadinstitute/gatk:latest"])
  String gitc_image = select_first([gitc_docker, "broadinstitute/genomes-in-the-cloud:2.3.1-1512499786"])

  # Size of the input in GB; each disk request below is derived from a file size plus additional_disk
  Float input_size = size(input_bam, "GB")

  # Build the read-group-to-output-file map consumed by RevertSam
  call GenerateOutputMap {
    input:
      input_bam = input_bam,
      docker = gitc_image,
      disk_size = ceil(input_size) + additional_disk
  }

  # Revert the BAM, writing one coordinate-sorted file per read group
  call RevertSam {
    input:
      input_bam = input_bam,
      output_map = GenerateOutputMap.output_map,
      gatk_path = path2gatk,
      docker = gatk_image,
      disk_size = ceil(input_size * 3) + additional_disk
  }

  # Re-sort every per-read-group BAM by queryname, in parallel
  scatter (unmapped_bam in RevertSam.unmapped_bams) {
    # Strip the suffix that GenerateOutputMap put on each file name
    String output_basename = basename(unmapped_bam, ".coord.sorted.unmapped.bam")
    Float unmapped_bam_size = size(unmapped_bam, "GB")

    call SortSam {
      input:
        input_bam = unmapped_bam,
        sorted_bam_name = output_basename + ".unmapped.bam",
        gatk_path = path2gatk,
        docker = gatk_image,
        disk_size = ceil(unmapped_bam_size * 6) + additional_disk
    }
  }

  # Outputs retained when execution is complete
  output {
    Array[File] output_bams = SortSam.sorted_bam
  }
}

# Build the OUTPUT_MAP file for RevertSam: a two-column TSV (READ_GROUP_ID, OUTPUT)
# with one row per read group found in the header of input_bam.
#
# Inputs:
#   input_bam - BAM whose header is inspected with samtools
#   disk_size - size of the local disk to request
#   docker    - image to run in; it must provide samtools and bash
# Output:
#   output_map - output_map.tsv, mapping each read group ID to
#                "<ID>.coord.sorted.unmapped.bam"
task GenerateOutputMap {
  File input_bam
  Int disk_size

  String docker

  command {
    # pipefail makes a samtools failure (or a header with no @RG line) fail the task
    # instead of silently producing an empty map.
    set -eo pipefail

    # Only true @RG header lines are considered: '^@RG' keeps out @PG/@CO lines that
    # merely mention "@RG" (for example a recorded aligner command line).
    # The ID tag is located by name, so it does not have to be the first tag on the line.
    samtools view -H "${input_bam}" \
      | grep '^@RG' \
      | tr '\t' '\n' \
      | grep '^ID:' \
      | sed 's/^ID://' > readgroups.txt

    printf 'READ_GROUP_ID\tOUTPUT\n' > output_map.tsv

    # Read line by line so that IDs are never word-split or glob-expanded by the shell
    while IFS= read -r rg; do
      printf '%s\t%s.coord.sorted.unmapped.bam\n' "$rg" "$rg" >> output_map.tsv
    done < readgroups.txt
  }

  runtime {
    docker: docker
    disks: "local-disk " + disk_size + " HDD"
    # Integer, matching the other tasks in this file
    preemptible: 3
    memory: "1 GB"
  }
  output {
    File output_map = "output_map.tsv"
  }
}

# Run RevertSam through the GATK launcher on input_bam, writing one BAM per
# read group to the file names listed in output_map.
task RevertSam {
  # Command parameters
  File input_bam
  File output_map
  String gatk_path

  # Runtime parameters
  String docker
  Int disk_size

  command {
    ${gatk_path} --java-options "-Xmx1000m" \
    RevertSam \
    --INPUT ${input_bam} \
    --OUTPUT_MAP ${output_map} \
    --OUTPUT_BY_READGROUP true \
    --VALIDATION_STRINGENCY LENIENT \
    --ATTRIBUTE_TO_CLEAR FT \
    --ATTRIBUTE_TO_CLEAR CO \
    --SORT_ORDER coordinate
  }
  output {
    # Collects every BAM found in the task's working directory
    Array[File] unmapped_bams = glob("*.bam")
  }
  runtime {
    docker: docker
    memory: "1200 MB"
    disks: "local-disk " + disk_size + " HDD"
  }
}

# Sort input_bam by queryname with SortSam (run through the GATK launcher)
# and write the result to sorted_bam_name.
task SortSam {
  # Command parameters
  File input_bam
  String sorted_bam_name
  String gatk_path

  # Runtime parameters
  String docker
  Int disk_size

  command {
    ${gatk_path} --java-options "-Xmx3000m" \
    SortSam \
    --INPUT ${input_bam} \
    --OUTPUT ${sorted_bam_name} \
    --SORT_ORDER queryname \
    --MAX_RECORDS_IN_RAM 1000000
  }
  output {
    File sorted_bam = "${sorted_bam_name}"
  }
  runtime {
    docker: docker
    memory: "3500 MB"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 3
  }
}

145 changes: 145 additions & 0 deletions paired-fastq-to-unmapped-bam-fc.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
## Copyright Broad Institute, 2018
##
## This WDL converts paired FASTQ to uBAM and adds read group information
##
## Requirements/expectations :
## - Pair-end sequencing data in FASTQ format (one file per orientation)
## - One or more read groups, one per pair of FASTQ files
## - A readgroup.list file with the following format :
## ``readgroup fastq_pair1 fastq_pair2 sample_name library_name platform_unit run_date platform_name sequencing_center``
##
## Outputs :
## - Set of unmapped BAMs, one per read group
## - File of a list of the generated unmapped BAMs
##
## Cromwell version support
## - Successfully tested on v32
## - Does not work on versions < v23 due to output syntax
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
## For program versions, see docker containers.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
## licensing information pertaining to the included programs.
# WORKFLOW DEFINITION
workflow ConvertPairedFastQsToUnmappedBamWf {
  # TSV with one read group per row. Columns, by position:
  #   0 readgroup name     1 FASTQ 1 path      2 FASTQ 2 path
  #   3 sample name        4 library name      5 platform unit
  #   6 run date           7 platform name     8 sequencing center
  File readgroup_list
  Array[Array[String]] readgroup_array = read_tsv(readgroup_list)

  # Base name for the list of generated uBAMs. CreateFoFN appends ".list" itself,
  # so the suffix must not be added here: "samples.list" -> "samples.unmapped.bam",
  # which CreateFoFN turns into "samples.unmapped.bam.list".
  String ubam_list_name = basename(readgroup_list, ".list") + ".unmapped.bam"

  # Optional overrides; the defaults are given in the select_first calls
  String? gatk_docker_override
  String gatk_docker = select_first([gatk_docker_override, "us.gcr.io/broad-gatk/gatk:latest"])
  String? gatk_path_override
  String gatk_path = select_first([gatk_path_override, "/gatk/gatk"])
  Int? preemptible_attempts

  # Convert multiple pairs of input fastqs in parallel
  scatter (i in range(length(readgroup_array))) {

    # Convert pair of FASTQs to uBAM
    call PairedFastQsToUnmappedBAM {
      input:
        fastq_1 = readgroup_array[i][1],
        fastq_2 = readgroup_array[i][2],
        readgroup_name = readgroup_array[i][0],
        sample_name = readgroup_array[i][3],
        library_name = readgroup_array[i][4],
        platform_unit = readgroup_array[i][5],
        run_date = readgroup_array[i][6],
        platform_name = readgroup_array[i][7],
        sequencing_center = readgroup_array[i][8],
        gatk_path = gatk_path,
        docker = gatk_docker,
        preemptible_attempts = preemptible_attempts
    }
  }

  # Create a file with a list of the generated ubams
  call CreateFoFN {
    input:
      array_of_files = PairedFastQsToUnmappedBAM.output_bam,
      fofn_name = ubam_list_name,
      docker = gatk_docker
  }

  # Outputs that will be retained when execution is complete
  output {
    Array[File] output_bams = PairedFastQsToUnmappedBAM.output_bam
    File unmapped_bam_list = CreateFoFN.fofn_list
  }
}

# TASK DEFINITIONS
# Convert a pair of FASTQs to uBAM
# Convert one pair of FASTQ files into "<readgroup_name>.unmapped.bam" with
# FastqToSam (run through the GATK launcher), attaching the given read group metadata.
#
# Runtime defaults when the optional inputs are unset: 10 GB memory,
# 100 GB local HDD, 3 preemptible attempts.
task PairedFastQsToUnmappedBAM {
  # Command parameters
  String sample_name
  File fastq_1
  File fastq_2
  String readgroup_name
  String library_name
  String platform_unit
  String run_date
  String platform_name
  String sequencing_center

  # Runtime parameters
  Int? disk_space_gb
  Int? machine_mem_gb
  Int? preemptible_attempts
  String docker
  String gatk_path

  command {
    # Every interpolated value is quoted: the metadata comes from a free-text TSV,
    # so an unquoted value containing whitespace would be split into several arguments.
    ${gatk_path} --java-options "-Xmx3000m" \
    FastqToSam \
    --FASTQ "${fastq_1}" \
    --FASTQ2 "${fastq_2}" \
    --OUTPUT "${readgroup_name}.unmapped.bam" \
    --READ_GROUP_NAME "${readgroup_name}" \
    --SAMPLE_NAME "${sample_name}" \
    --LIBRARY_NAME "${library_name}" \
    --PLATFORM_UNIT "${platform_unit}" \
    --RUN_DATE "${run_date}" \
    --PLATFORM "${platform_name}" \
    --SEQUENCING_CENTER "${sequencing_center}"
  }
  runtime {
    docker: docker
    memory: select_first([machine_mem_gb, 10]) + " GB"
    cpu: "1"
    disks: "local-disk " + select_first([disk_space_gb, 100]) + " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }
  output {
    File output_bam = "${readgroup_name}.unmapped.bam"
  }
}

# Write the entries of array_of_files, one per line, to "<fofn_name>.list".
task CreateFoFN {
  # Name of the list file, without the ".list" extension added below
  String fofn_name
  # Entries to be listed
  Array[String] array_of_files

  # Image the command runs in
  String docker

  command {
    mv ${write_lines(array_of_files)} ${fofn_name}.list
  }
  runtime {
    preemptible: 3
    docker: docker
  }
  output {
    File fofn_list = "${fofn_name}.list"
  }
}

Loading

0 comments on commit 8f36d69

Please sign in to comment.