Skip to content

Commit

Permalink
Dev (#9)
Browse files Browse the repository at this point in the history
* added cram-to-bam. updated pairedtoubam to use gatk4

* Update README.md

removed gatk software requirement because repo will contain more than one wdl which may use different versions.

* Update README.md

* Wdl now uses a readgroup tsv file as input. Added task to compose a file containing a list of the generated ubams

* minor

* minor

* minor edits

* corrected memory placement

* minor edits

* added bam-to-unmapped-bams wdl

* fixed comment number

* changed to use latest gatk docker

* fastq to bam now uses arrays as input

* updated descriptor for paired fastq to bam

* updated input in description

* added a firecloud version for fastq to Ubam

* minor format changes. changed pairedfastq2Ubam docker to gcr

* Minor update to ReadMe, added default docker to cram2bam

* decreased mem size in cram2bam to reduce cost

* Updated WDL to 1.0, removed GenerateOutputMap from bam2ubam, removed fc version of fastq2ubam, added defaults to cram2bam

* Updated WDL to 1.0, removed GenerateOutputMap from bam2ubam, added defaults to cram2bam

* minor format change to Readme

* replaced '$' with '~'

* simplified workflow to accept one pair of sample at a time

* minor update to cromwell version note

* minor update to pair2ubam in Readme

* added task variables to global workflow inputs

* added commas

* made input variables global
  • Loading branch information
bshifaw committed Dec 20, 2019
1 parent 088149a commit f9bc6bf
Show file tree
Hide file tree
Showing 8 changed files with 90 additions and 73 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ This WDL converts paired FASTQ to uBAM and adds read group information
- sequencing_center

#### Outputs
- Set of unmapped BAMs, one per read group
- File containing a list of the generated unmapped BAMs
- Unmapped BAM

### bam-to-unmapped-bams :
This WDL converts BAM to unmapped BAMs
Expand Down
2 changes: 1 addition & 1 deletion bam-to-unmapped-bams.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ version 1.0
## - Sorted Unmapped BAMs
##
## Cromwell version support
## - Successfully tested on v33
## - Successfully tested on v47
## - Does not work on versions < v23 due to output syntax
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
Expand Down
10 changes: 5 additions & 5 deletions cram-to-bam.inputs.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"CramToBamFlow.CramToBamTask.sample_name": "NA12878",
"CramToBamFlow.CramToBamTask.input_cram": "gs://gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram",
"CramToBamFlow.sample_name": "NA12878",
"CramToBamFlow.input_cram": "gs://gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram",

"CramToBamFlow.CramToBamTask.ref_dict": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict",
"CramToBamFlow.CramToBamTask.ref_fasta": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta",
"CramToBamFlow.CramToBamTask.ref_fasta_index": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai"
"CramToBamFlow.ref_dict": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict",
"CramToBamFlow.ref_fasta": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta",
"CramToBamFlow.ref_fasta_index": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai"
}
12 changes: 11 additions & 1 deletion cram-to-bam.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ version 1.0
## BGZIP_VER=1.3
## SVTOOLKIT_VER=2.00-1650
## It was tested pulling the HG38 reference Fasta and Fai.
## Successfully tested on Cromwell version 28. Does not work on versions < v23 due to output syntax
## Successfully tested on Cromwell version 47. Does not work on versions < v23 due to output syntax
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
##
## LICENSING : This script is released under the WDL source code license (BSD-3) (see LICENSE in https://github.com/broadinstitute/wdl).
Expand All @@ -27,13 +27,23 @@ version 1.0
#WORKFLOW DEFINITION
workflow CramToBamFlow {
input {
File ref_fasta
File ref_fasta_index
File ref_dict
File input_cram
String sample_name
String gotc_docker = "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"
Int preemptible_tries = 3
}

#converts CRAM to SAM to BAM and makes BAI
call CramToBamTask{
input:
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
ref_dict = ref_dict,
input_cram = input_cram,
sample_name = sample_name,
docker_image = gotc_docker,
preemptible_tries = preemptible_tries
}
Expand Down
3 changes: 1 addition & 2 deletions interleaved-fastq-to-paired-fastq.inputs.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
{
"UninterleaveFastqs.uninterleave_fqs.input_fastq": "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_interleaved.fastq"
"UninterleaveFastqs.input_fastq": "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_interleaved.fastq"
}

8 changes: 7 additions & 1 deletion interleaved-fastq-to-paired-fastq.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,14 @@ version 1.0
##################

workflow UninterleaveFastqs {
input {
File input_fastq
}

call uninterleave_fqs
call uninterleave_fqs {
input:
input_fastq = input_fastq
}

}

Expand Down
26 changes: 10 additions & 16 deletions paired-fastq-to-unmapped-bam.inputs.json
Original file line number Diff line number Diff line change
@@ -1,19 +1,13 @@
{
"ConvertPairedFastQsToUnmappedBamWf.readgroup_name": ["NA12878_A", "NA12878_B", "NA12878_C"],
"ConvertPairedFastQsToUnmappedBamWf.sample_name": ["NA12878", "NA12878", "NA12878"],
"ConvertPairedFastQsToUnmappedBamWf.fastq_1": [
"gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.1.ATCACGAT.20k_reads_1.fastq",
"gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.2.ATCACGAT.20k_reads_1.fastq",
"gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_reads_1.fastq"],
"ConvertPairedFastQsToUnmappedBamWf.fastq_2": [
"gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.1.ATCACGAT.20k_reads_2.fastq",
"gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.2.ATCACGAT.20k_reads_2.fastq",
"gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_reads_2.fastq"],
"ConvertPairedFastQsToUnmappedBamWf.library_name": ["Solexa-NA12878", "Solexa-NA12878","Solexa-NA12878"],
"ConvertPairedFastQsToUnmappedBamWf.platform_unit": ["H06HDADXX130110.2.ATCACGAT", "H06HDADXX130110.1.ATCACGAT", "H06JUADXX130110.1.ATCACGAT"],
"ConvertPairedFastQsToUnmappedBamWf.run_date": ["2016-09-01T02:00:00+0200", "2016-09-01T02:00:00+0200", "2016-09-01T02:00:00+0200"],
"ConvertPairedFastQsToUnmappedBamWf.platform_name": ["illumina","illumina","illumina"],
"ConvertPairedFastQsToUnmappedBamWf.sequencing_center": ["BI","BI","BI"],
"ConvertPairedFastQsToUnmappedBamWf.readgroup_name": "NA12878_A",
"ConvertPairedFastQsToUnmappedBamWf.sample_name": "NA12878",
"ConvertPairedFastQsToUnmappedBamWf.fastq_1": "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.1.ATCACGAT.20k_reads_1.fastq",
"ConvertPairedFastQsToUnmappedBamWf.fastq_2": "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.1.ATCACGAT.20k_reads_2.fastq",
"ConvertPairedFastQsToUnmappedBamWf.library_name": "Solexa-NA12878",
"ConvertPairedFastQsToUnmappedBamWf.platform_unit": "H06HDADXX130110.2.ATCACGAT",
"ConvertPairedFastQsToUnmappedBamWf.run_date": "2016-09-01T02:00:00+0200",
"ConvertPairedFastQsToUnmappedBamWf.platform_name": "illumina",
"ConvertPairedFastQsToUnmappedBamWf.sequencing_center": "BI",

"ConvertPairedFastQsToUnmappedBamWf.ubam_list_name": "NA12878_unmapped_bam"
"ConvertPairedFastQsToUnmappedBamWf.make_fofn": true
}
99 changes: 54 additions & 45 deletions paired-fastq-to-unmapped-bam.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,20 @@ version 1.0
## Requirements/expectations :
## - Pair-end sequencing data in FASTQ format (one file per orientation)
## - The following metadata descriptors per sample:
## ```readgroup fastq_pair1_file_path fastq_pair2_file_path sample_name library_name platform_unit run_date platform_name sequencing_center```
## - readgroup
## - sample_name
## - library_name
## - platform_unit
## - run_date
## - platform_name
## - sequencing_center
##
## Outputs :
## - Set of unmapped BAMs, one per read group
## - File of a list of the generated unmapped BAMs
##
## Cromwell version support
## - Successfully tested on v32
## - Successfully tested on v47
## - Does not work on versions < v23 due to output syntax
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
Expand All @@ -30,52 +36,53 @@ version 1.0
# WORKFLOW DEFINITION
workflow ConvertPairedFastQsToUnmappedBamWf {
input {
Array[String] sample_name
Array[String] fastq_1
Array[String] fastq_2
Array[String] readgroup_name
Array[String] library_name
Array[String] platform_unit
Array[String] run_date
Array[String] platform_name
Array[String] sequencing_center
String sample_name
String fastq_1
String fastq_2
String readgroup_name
String library_name
String platform_unit
String run_date
String platform_name
String sequencing_center

String ubam_list_name
Boolean make_fofn = false

String gatk_docker = "broadinstitute/gatk:latest"
String gatk_path = "/gatk/gatk"
}
# Convert multiple pairs of input fastqs in parallel
scatter (i in range(length(readgroup_name))) {

# Convert pair of FASTQs to uBAM
call PairedFastQsToUnmappedBAM {
input:
sample_name = sample_name[i],
fastq_1 = fastq_1[i],
fastq_2 = fastq_2[i],
readgroup_name = readgroup_name[i],
library_name = library_name[i],
platform_unit = platform_unit[i],
run_date = run_date[i],
platform_name = platform_name[i],
sequencing_center = sequencing_center[i],
gatk_path = gatk_path,
docker = gatk_docker
}
}
String ubam_list_name = sample_name

#Create a file with a list of the generated ubams
call CreateFoFN {
# Convert pair of FASTQs to uBAM
call PairedFastQsToUnmappedBAM {
input:
array_of_files = PairedFastQsToUnmappedBAM.output_bam,
fofn_name = ubam_list_name + ".ubam"
sample_name = sample_name,
fastq_1 = fastq_1,
fastq_2 = fastq_2,
readgroup_name = readgroup_name,
library_name = library_name,
platform_unit = platform_unit,
run_date = run_date,
platform_name = platform_name,
sequencing_center = sequencing_center,
gatk_path = gatk_path,
docker = gatk_docker
}

#Create a file with the generated ubam
if (make_fofn) {
call CreateFoFN {
input:
ubam = PairedFastQsToUnmappedBAM.output_unmapped_bam,
fofn_name = ubam_list_name + ".ubam"
}
}

# Outputs that will be retained when execution is complete
output {
Array[File] output_bams = PairedFastQsToUnmappedBAM.output_bam
File unmapped_bam_list = CreateFoFN.fofn_list
File output_unmapped_bam = PairedFastQsToUnmappedBAM.output_unmapped_bam
File? unmapped_bam_list = CreateFoFN.fofn_list
}
}

Expand All @@ -97,14 +104,15 @@ task PairedFastQsToUnmappedBAM {
String gatk_path

# Runtime parameters
Int disk_space_gb = 100
Int machine_mem_gb = 10
Int addtional_disk_space_gb = 10
Int machine_mem_gb = 7
Int preemptible_attempts = 3
String docker
}

Int command_mem_gb = machine_mem_gb - 1
Int disk_space_gb = ceil((size(fastq_1, "GB") + size(fastq_2, "GB")) * 2 ) + addtional_disk_space_gb
command {
~{gatk_path} --java-options "-Xmx3000m" \
~{gatk_path} --java-options "-Xmx~{command_mem_gb}g" \
FastqToSam \
--FASTQ ~{fastq_1} \
--FASTQ2 ~{fastq_2} \
Expand All @@ -120,24 +128,25 @@ task PairedFastQsToUnmappedBAM {
runtime {
docker: docker
memory: machine_mem_gb + " GB"
cpu: "1"
disks: "local-disk " + disk_space_gb + " HDD"
preemptible: preemptible_attempts
}
output {
File output_bam = "~{readgroup_name}.unmapped.bam"
File output_unmapped_bam = "~{readgroup_name}.unmapped.bam"
}
}

# Creates a file of file names of the uBAMs, which is a text file with each row having the path to the file.
# Creates a file of file names of the uBAM, which is a text file with each row having the path to the file.
# In this case there will only be one file path in the txt file but this format is used by
# the pre-processing for variant discovery workflow.
task CreateFoFN {
input {
# Command parameters
Array[String] array_of_files
String ubam
String fofn_name
}
command {
mv ~{write_lines(array_of_files)} ~{fofn_name}.list
echo ~{ubam} > ~{fofn_name}.list
}
output {
File fofn_list = "~{fofn_name}.list"
Expand Down

0 comments on commit f9bc6bf

Please sign in to comment.