-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* added cram-to-bam. updated pairedtoubam to use gatk4 * Update README.md * removed gatk software requiremnt because repo will contain more than one wdl which may use different versions. * Wdl now uses a readgroup tsv file as input. Added task to compose a file containing a list of the generated ubams * added bam-to-unmapped-bams wdl * changed to use latest gatk docker * fastq to bam now uses arrays as input * updated descriptor for paired fastq to bam * added a firecloud version for fastq to Ubam * chaged pairedfastq2Ubam docker to gcr
- Loading branch information
bshifaw
authored
Jul 13, 2018
1 parent
413fca8
commit 8f36d69
Showing
6 changed files
with
444 additions
and
69 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
|
||
{ | ||
"##_COMMENT1": "INPUTS", | ||
"BamToUnmappedBams.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_20k_hg38/NA12878.bam" | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
## Copyright Broad Institute, 2018 | ||
## | ||
## This WDL converts BAM to unmapped BAMs | ||
## | ||
## Requirements/expectations : | ||
## - BAM file | ||
## | ||
## Outputs : | ||
## - Sorted Unmapped BAMs | ||
## | ||
## Cromwell version support | ||
## - Successfully tested on v33 | ||
## - Does not work on versions < v23 due to output syntax | ||
## | ||
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. | ||
## For program versions, see docker containers. | ||
## | ||
## LICENSING : | ||
## This script is released under the WDL source code license (BSD-3) (see LICENSE in | ||
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may | ||
## be subject to different licenses. Users are responsible for checking that they are | ||
## authorized to run all programs before running this script. Please see the docker | ||
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed | ||
## licensing information pertaining to the included programs. | ||
# WORKFLOW DEFINITION | ||
workflow BamToUnmappedBams { | ||
  # BAM that will be split into one unmapped BAM per read group | ||
  File input_bam | ||
|
||
  # Optional overrides; each one falls back to the default chosen below | ||
  Int? additional_disk_size | ||
  String? gatk_path | ||
  String? gitc_docker | ||
  String? gatk_docker | ||
|
||
  # Resolved settings used by the calls | ||
  Int additional_disk = select_first([additional_disk_size, 20]) | ||
  String path2gatk = select_first([gatk_path, "/gatk/gatk"]) | ||
  String gitc_image = select_first([gitc_docker, "broadinstitute/genomes-in-the-cloud:2.3.1-1512499786"]) | ||
  String gatk_image = select_first([gatk_docker, "broadinstitute/gatk:latest"]) | ||
|
||
  # Size of the input in GB; the disk requests of the first two calls are derived from it | ||
  Float input_size = size(input_bam, "GB") | ||
|
||
  # Step 1: write the read-group-to-output-file table consumed by RevertSam | ||
  call GenerateOutputMap { | ||
    input: | ||
      docker = gitc_image, | ||
      disk_size = ceil(input_size) + additional_disk, | ||
      input_bam = input_bam | ||
  } | ||
|
||
  # Step 2: revert the BAM, producing one BAM per row of the output map | ||
  call RevertSam { | ||
    input: | ||
      docker = gatk_image, | ||
      gatk_path = path2gatk, | ||
      disk_size = ceil(input_size * 3) + additional_disk, | ||
      output_map = GenerateOutputMap.output_map, | ||
      input_bam = input_bam | ||
  } | ||
|
||
  # Step 3: sort every reverted BAM independently. The ".coord.sorted.unmapped.bam" | ||
  # suffix is replaced by ".unmapped.bam" for the final file name. | ||
  scatter (reverted_bam in RevertSam.unmapped_bams) { | ||
    call SortSam { | ||
      input: | ||
        docker = gatk_image, | ||
        gatk_path = path2gatk, | ||
        disk_size = ceil(size(reverted_bam, "GB") * 6) + additional_disk, | ||
        sorted_bam_name = basename(reverted_bam, ".coord.sorted.unmapped.bam") + ".unmapped.bam", | ||
        input_bam = reverted_bam | ||
    } | ||
  } | ||
|
||
  # Final products: one sorted unmapped BAM per scatter shard | ||
  output { | ||
    Array[File] output_bams = SortSam.sorted_bam | ||
  } | ||
} | ||
|
||
task GenerateOutputMap { | ||
  # Writes output_map.tsv, a two-column table (READ_GROUP_ID, OUTPUT) with one row per | ||
  # read group ID found in the header of input_bam. Each ID is mapped to the file name | ||
  # "<ID>.coord.sorted.unmapped.bam". | ||
  # | ||
  # input_bam : BAM whose header is inspected | ||
  # disk_size : size in GB of the local disk to request | ||
  # docker    : image that provides samtools | ||
  File input_bam | ||
  Int disk_size | ||
|
||
  String docker | ||
|
||
  command { | ||
    # pipefail makes a samtools failure fail the task instead of silently yielding an | ||
    # empty map. A header without any @RG line now fails here as well. | ||
    set -eo pipefail | ||
|
||
    # Only lines that START with @RG are read group records; an unanchored match would | ||
    # also catch @PG lines whose recorded command line mentions @RG. The ID tag is | ||
    # located by its name rather than assumed to be the second column. | ||
    samtools view -H ${input_bam} | grep '^@RG' | tr '\t' '\n' | grep '^ID:' | sed 's/^ID://' > readgroups.txt | ||
|
||
    printf 'READ_GROUP_ID\tOUTPUT\n' > output_map.tsv | ||
|
||
    # Read line by line so that an ID is never word-split, and use printf so that | ||
    # backslashes inside an ID are written verbatim. | ||
    while read -r rg; do | ||
      printf '%s\t%s.coord.sorted.unmapped.bam\n' "$rg" "$rg" >> output_map.tsv | ||
    done < readgroups.txt | ||
  } | ||
|
||
  runtime { | ||
    docker: docker | ||
    disks: "local-disk " + disk_size + " HDD" | ||
    # Integer, matching the other tasks of this file | ||
    preemptible: 3 | ||
    memory: "1 GB" | ||
  } | ||
  output { | ||
    File output_map = "output_map.tsv" | ||
  } | ||
} | ||
|
||
task RevertSam { | ||
  # Runs GATK RevertSam on input_bam with OUTPUT_BY_READGROUP enabled, using output_map | ||
  # to name the per-read-group files, and collects every BAM written to the working | ||
  # directory. | ||
|
||
  # Tool location and container | ||
  String gatk_path | ||
  String docker | ||
|
||
  # Data inputs | ||
  File input_bam | ||
  File output_map | ||
|
||
  # Size in GB of the local disk to request | ||
  Int disk_size | ||
|
||
  command { | ||
    ${gatk_path} --java-options "-Xmx1000m" \ | ||
    RevertSam \ | ||
    --INPUT ${input_bam} \ | ||
    --OUTPUT_MAP ${output_map} \ | ||
    --OUTPUT_BY_READGROUP true \ | ||
    --VALIDATION_STRINGENCY LENIENT \ | ||
    --ATTRIBUTE_TO_CLEAR FT \ | ||
    --ATTRIBUTE_TO_CLEAR CO \ | ||
    --SORT_ORDER coordinate | ||
  } | ||
  output { | ||
    # One entry per BAM matched by the glob in the working directory | ||
    Array[File] unmapped_bams = glob("*.bam") | ||
  } | ||
  runtime { | ||
    # The memory request sits above the 1000 MB Java heap set in the command | ||
    memory: "1200 MB" | ||
    disks: "local-disk " + disk_size + " HDD" | ||
    docker: docker | ||
  } | ||
} | ||
|
||
task SortSam { | ||
  # Runs GATK SortSam on input_bam with queryname sort order and writes the result to | ||
  # sorted_bam_name. | ||
|
||
  # Tool location and container | ||
  String gatk_path | ||
  String docker | ||
|
||
  # BAM to sort and the file name to give the sorted copy | ||
  File input_bam | ||
  String sorted_bam_name | ||
|
||
  # Size in GB of the local disk to request | ||
  Int disk_size | ||
|
||
  command { | ||
    ${gatk_path} --java-options "-Xmx3000m" \ | ||
    SortSam \ | ||
    --INPUT ${input_bam} \ | ||
    --OUTPUT ${sorted_bam_name} \ | ||
    --SORT_ORDER queryname \ | ||
    --MAX_RECORDS_IN_RAM 1000000 | ||
  } | ||
  output { | ||
    # The file written by --OUTPUT above | ||
    File sorted_bam = sorted_bam_name | ||
  } | ||
  runtime { | ||
    # The memory request sits above the 3000 MB Java heap set in the command | ||
    memory: "3500 MB" | ||
    disks: "local-disk " + disk_size + " HDD" | ||
    preemptible: 3 | ||
    docker: docker | ||
  } | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
## Copyright Broad Institute, 2018 | ||
## | ||
## This WDL converts paired FASTQ to uBAM and adds read group information | ||
## | ||
## Requirements/expectations : | ||
## - Paired-end sequencing data in FASTQ format (one file per orientation) | ||
## - One or more read groups, one per pair of FASTQ files | ||
## - A tab-separated readgroup.list file with one row per read group, in the following column order : | ||
## ``readgroup fastq_pair1 fastq_pair2 sample_name library_name platform_unit run_date platform_name sequencing_center`` | ||
## | ||
## Outputs : | ||
## - Set of unmapped BAMs, one per read group | ||
## - File of a list of the generated unmapped BAMs | ||
## | ||
## Cromwell version support | ||
## - Successfully tested on v32 | ||
## - Does not work on versions < v23 due to output syntax | ||
## | ||
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. | ||
## For program versions, see docker containers. | ||
## | ||
## LICENSING : | ||
## This script is released under the WDL source code license (BSD-3) (see LICENSE in | ||
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may | ||
## be subject to different licenses. Users are responsible for checking that they are | ||
## authorized to run all programs before running this script. Please see the docker | ||
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed | ||
## licensing information pertaining to the included programs. | ||
# WORKFLOW DEFINITION | ||
workflow ConvertPairedFastQsToUnmappedBamWf { | ||
  # Tab-separated table with one row per read group. Columns, in order: | ||
  # readgroup, fastq_1, fastq_2, sample_name, library_name, platform_unit, | ||
  # run_date, platform_name, sequencing_center | ||
  File readgroup_list | ||
  Array[Array[String]] readgroup_array = read_tsv(readgroup_list) | ||
|
||
  # Base name handed to CreateFoFN, which appends ".list" itself. Ending the value in | ||
  # ".unmapped.bam" yields "<name>.unmapped.bam.list"; the former "unmapped.bam.list" | ||
  # suffix lacked the separating dot and ended up as "<name>unmapped.bam.list.list". | ||
  String ubam_list_name = basename(readgroup_list, ".list") + ".unmapped.bam" | ||
|
||
  # Optional overrides for the GATK container and launcher path | ||
  String? gatk_docker_override | ||
  String gatk_docker = select_first([gatk_docker_override, "us.gcr.io/broad-gatk/gatk:latest"]) | ||
  String? gatk_path_override | ||
  String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) | ||
  Int? preemptible_attempts | ||
|
||
  # Convert multiple pairs of input fastqs in parallel, one shard per table row | ||
  scatter (readgroup_row in readgroup_array) { | ||
|
||
    # Convert pair of FASTQs to uBAM | ||
    call PairedFastQsToUnmappedBAM { | ||
      input: | ||
        readgroup_name = readgroup_row[0], | ||
        fastq_1 = readgroup_row[1], | ||
        fastq_2 = readgroup_row[2], | ||
        sample_name = readgroup_row[3], | ||
        library_name = readgroup_row[4], | ||
        platform_unit = readgroup_row[5], | ||
        run_date = readgroup_row[6], | ||
        platform_name = readgroup_row[7], | ||
        sequencing_center = readgroup_row[8], | ||
        gatk_path = gatk_path, | ||
        docker = gatk_docker, | ||
        preemptible_attempts = preemptible_attempts | ||
    } | ||
  } | ||
|
||
  # Create a file with a list of the generated ubams | ||
  call CreateFoFN { | ||
    input: | ||
      array_of_files = PairedFastQsToUnmappedBAM.output_bam, | ||
      fofn_name = ubam_list_name, | ||
      docker = gatk_docker | ||
  } | ||
|
||
  # Outputs that will be retained when execution is complete | ||
  output { | ||
    Array[File] output_bams = PairedFastQsToUnmappedBAM.output_bam | ||
    File unmapped_bam_list = CreateFoFN.fofn_list | ||
  } | ||
} | ||
|
||
# TASK DEFINITIONS | ||
# Convert a pair of FASTQs to uBAM | ||
task PairedFastQsToUnmappedBAM { | ||
  # Runs GATK FastqToSam on one pair of FASTQ files and writes | ||
  # "<readgroup_name>.unmapped.bam" carrying the supplied read group metadata. | ||
|
||
  # Command parameters | ||
  String sample_name | ||
  File fastq_1 | ||
  File fastq_2 | ||
  String readgroup_name | ||
  String library_name | ||
  String platform_unit | ||
  String run_date | ||
  String platform_name | ||
  String sequencing_center | ||
|
||
  # Runtime parameters; unset optionals fall back to the defaults in the runtime section | ||
  Int? disk_space_gb | ||
  Int? machine_mem_gb | ||
  Int? preemptible_attempts | ||
  String docker | ||
  String gatk_path | ||
|
||
  command { | ||
    # Every interpolated value is double-quoted so that a value containing whitespace | ||
    # reaches the tool as a single argument instead of being word-split by the shell. | ||
    ${gatk_path} --java-options "-Xmx3000m" \ | ||
    FastqToSam \ | ||
    --FASTQ "${fastq_1}" \ | ||
    --FASTQ2 "${fastq_2}" \ | ||
    --OUTPUT "${readgroup_name}.unmapped.bam" \ | ||
    --READ_GROUP_NAME "${readgroup_name}" \ | ||
    --SAMPLE_NAME "${sample_name}" \ | ||
    --LIBRARY_NAME "${library_name}" \ | ||
    --PLATFORM_UNIT "${platform_unit}" \ | ||
    --RUN_DATE "${run_date}" \ | ||
    --PLATFORM "${platform_name}" \ | ||
    --SEQUENCING_CENTER "${sequencing_center}" | ||
  } | ||
  runtime { | ||
    docker: docker | ||
    memory: select_first([machine_mem_gb, 10]) + " GB" | ||
    cpu: "1" | ||
    disks: "local-disk " + select_first([disk_space_gb, 100]) + " HDD" | ||
    preemptible: select_first([preemptible_attempts, 3]) | ||
  } | ||
  output { | ||
    # The file written by --OUTPUT above | ||
    File output_bam = "${readgroup_name}.unmapped.bam" | ||
  } | ||
} | ||
|
||
task CreateFoFN { | ||
  # Produces "<fofn_name>.list", a text file holding one entry of array_of_files per line. | ||
|
||
  # Container to run in | ||
  String docker | ||
|
||
  # Entries to list and the base name of the resulting file | ||
  String fofn_name | ||
  Array[String] array_of_files | ||
|
||
  command { | ||
    mv ${write_lines(array_of_files)} ${fofn_name}.list | ||
  } | ||
  runtime { | ||
    preemptible: 3 | ||
    docker: docker | ||
  } | ||
  output { | ||
    # The file moved into place by the command above | ||
    File fofn_list = "${fofn_name}.list" | ||
  } | ||
} | ||
|
Oops, something went wrong.