diff --git a/README.md b/README.md index 3dec02e..9f5f186 100644 --- a/README.md +++ b/README.md @@ -83,3 +83,7 @@ Cromwell version support override:NCBI_Build=37,Strand=+,status=Somatic,phase=Phase_I,sequencer=Illumina,Tumor_Validation_Allele1=,Tumor_Validation_Allele2=,Match_Norm_Validation_Allele1=,Match_Norm_Validation_Allele2=,Verification_Status=,Validation_Status=,Validation_Method=,Score=,BAM_file=,Match_Norm_Seq_Allele1=,Match_Norm_Seq_Allele2= ``` +### Important Note: +- Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +- For help running workflows on the Google Cloud Platform or locally, please +view the following tutorial: [(How to) Execute Workflows from the gatk-workflows Git Organization](https://software.broadinstitute.org/gatk/documentation/article?id=12521). diff --git a/mutect2-normal-normal.inputs.json b/mutect2-normal-normal.inputs.json index 24ac267..9c63de9 100644 --- a/mutect2-normal-normal.inputs.json +++ b/mutect2-normal-normal.inputs.json @@ -34,6 +34,6 @@ "Mutect2NormalNormal.scatter_count": "10", "##_COMMENT4": "Docker", - "Mutect2NormalNormal.gatk_docker": "broadinstitute/gatk:4.0.5.0" + "Mutect2NormalNormal.gatk_docker": "broadinstitute/gatk:4.0.8.1" } diff --git a/mutect2.exome.inputs.json b/mutect2.exome.inputs.json index 22cca5d..7a39699 100644 --- a/mutect2.exome.inputs.json +++ b/mutect2.exome.inputs.json @@ -1,7 +1,7 @@ { "##_COMMENT1": "Runtime", "##Mutect2.oncotator_docker": "(optional) String?", - "Mutect2.gatk_docker": "broadinstitute/gatk:4.0.5.0", + "Mutect2.gatk_docker": "broadinstitute/gatk:4.0.8.1", "##_COMMENT2": "Workflow options", "Mutect2.intervals": "gs://gatk-best-practices/somatic-b37/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list", diff --git a/mutect2.wdl b/mutect2.wdl index b342c7a..8467258 100644 --- a/mutect2.wdl +++ b/mutect2.wdl @@ -11,6 +11,7 @@ ## ** Runtime ** ## gatk_docker, oncotator_docker: docker images to use for GATK 4 Mutect2 and for Oncotator ## preemptible_attempts: how many preemptions to tolerate before switching to a non-preemptible machine (on Google) +## max_retries: how many times to retry failed tasks -- very important on the cloud when there are transient errors ## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar ## in the docker image. This must be supplied when running in an environment that does not support docker ## (e.g. SGE cluster on a Broad on-prem VM) @@ -21,7 +22,8 @@ ## artifact_modes: types of artifacts to consider in the orientation bias filter (optional) ## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional) ## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional) -## run_orientation_bias_filter: if true, run the orientation bias filter post-processing step (optional, false by default) +## run_orientation_bias_filter: (deprecated) if true, run the orientation bias filter (optional) +## run_orientation_bias_mixture_model_filter: if true, filter orientation bias sites based on the posterior probabilities computed by the read orientation artifact mixture model (optional) ## run_oncotator: if true, annotate the M2 VCFs using oncotator (to produce a TCGA MAF). Important: This requires a ## docker image and should not be run in environments where docker is unavailable (e.g. SGE cluster on ## a Broad on-prem VM). 
Access to docker hub is also required, since the task downloads a public docker image. @@ -43,6 +45,14 @@ ## filter_oncotator_maf: Whether the MAF generated by oncotator should have the filtered variants removed. Default: true ## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator. ## +## Funcotator parameters (see Funcotator help for more details). +## funco_reference_version: "hg19" for hg19 or b37. "hg38" for hg38. Default: "hg19" +## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process. +## funco_transcript_selection_mode: How to select transcripts in Funcotator. ALL, CANONICAL, or BEST_EFFECT +## funco_data_sources_tar_gz: Funcotator datasources tar gz file. Bucket location is recommended when running on the cloud. +## funco_annotation_defaults: Default values for annotations, when values are unspecified. Specified as :. For example: "Center:Broad" +## funco_annotation_overrides: Values for annotations, even when values are unspecified. Specified as :. For example: "Center:Broad" +## ## Outputs : ## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam ## file of reassembled reads if requested @@ -77,7 +87,10 @@ workflow Mutect2 { File? realignment_index_bundle String? realignment_extra_args Boolean? run_orientation_bias_filter - Boolean run_ob_filter = select_first([run_orientation_bias_filter, false]) + Boolean run_ob_filter = select_first([run_orientation_bias_filter, false]) && (length(select_first([artifact_modes, ["G/T", "C/T"]])) > 0) + Boolean? run_orientation_bias_mixture_model_filter + Boolean run_ob_mm_filter = select_first([run_orientation_bias_mixture_model_filter, false]) + File? ob_mm_filter_training_intervals Array[String]? artifact_modes File? tumor_sequencing_artifact_metrics String? m2_extra_args @@ -102,12 +115,12 @@ workflow Mutect2 { # funcotator inputs Boolean? run_funcotator Boolean run_funcotator_or_default = select_first([run_funcotator, false]) - String? reference_version - String? data_sources_tar_gz - String? transcript_selection_mode - Array[String]? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides + String? funco_reference_version + File? funco_data_sources_tar_gz + String? funco_transcript_selection_mode + File? funco_transcript_selection_list + Array[String]? funco_annotation_defaults + Array[String]? funco_annotation_overrides File? gatk_override @@ -121,8 +134,10 @@ workflow Mutect2 { Boolean? filter_funcotations Boolean filter_funcotations_or_default = select_first([filter_funcotations, true]) String? oncotator_extra_args + String? funcotator_extra_args Int? preemptible_attempts + Int? max_retries # Use as a last resort to increase the disk given to every task in case of ill behaving data Int? 
emergency_extra_disk @@ -135,6 +150,7 @@ workflow Mutect2 { # If no tar is provided, the task downloads one from broads ftp server Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100 + Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100 Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 # This is added to every task as padding, should increase if systematically you need more disk for every call @@ -166,6 +182,7 @@ workflow Mutect2 { gatk_override = gatk_override, gatk_docker = gatk_docker, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, disk_space = ref_size + ceil(size(intervals, "GB") * small_input_to_output_multiplier) + disk_pad } @@ -186,8 +203,10 @@ workflow Mutect2 { gnomad = gnomad, gnomad_index = gnomad_index, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, m2_extra_args = m2_extra_args, make_bamout = make_bamout_or_default, + artifact_prior_table = LearnReadOrientationModel.artifact_prior_table, compress = compress, gga_vcf = gga_vcf, gga_vcf_idx = gga_vcf_idx, @@ -203,7 +222,8 @@ workflow Mutect2 { call SumFloats as SumSubVcfs { input: sizes = sub_vcf_size, - preemptible_attempts = preemptible_attempts + preemptible_attempts = preemptible_attempts, + max_retries = max_retries } call MergeVCFs { @@ -215,6 +235,7 @@ workflow Mutect2 { gatk_override = gatk_override, gatk_docker = gatk_docker, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad } @@ -222,7 +243,8 @@ workflow Mutect2 { call SumFloats as SumSubBamouts { input: sizes = sub_bamout_size, - preemptible_attempts = preemptible_attempts + preemptible_attempts = preemptible_attempts, + max_retries = max_retries } call MergeBamOuts { @@ -234,7 +256,8 @@ workflow Mutect2 { output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), gatk_override = gatk_override, gatk_docker = gatk_docker, - disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad + disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad, + max_retries = max_retries } } @@ -245,6 +268,7 @@ workflow Mutect2 { ref_fasta = ref_fasta, ref_fai = ref_fai, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, tumor_bam = tumor_bam, tumor_bai = tumor_bai, gatk_override = gatk_override, @@ -252,6 +276,35 @@ workflow Mutect2 { } } + if (run_ob_mm_filter) { + call CollectF1R2Counts { + input: + gatk_docker = gatk_docker, + ref_fasta = ref_fasta, + ref_fai = ref_fai, + ref_dict = ref_dict, + preemptible_attempts = preemptible_attempts, + tumor_bam = tumor_bam, + tumor_bai = tumor_bai, + gatk_override = gatk_override, + disk_space = tumor_bam_size + ref_size + disk_pad, + intervals = if defined(ob_mm_filter_training_intervals) then ob_mm_filter_training_intervals else intervals, + max_retries = max_retries + } + + call LearnReadOrientationModel { + input: + alt_table = CollectF1R2Counts.alt_table, + ref_histogram = CollectF1R2Counts.ref_histogram, + alt_histograms = CollectF1R2Counts.alt_histograms, + tumor_sample = CollectF1R2Counts.tumor_sample, + gatk_override = gatk_override, + gatk_docker = gatk_docker, + preemptible_attempts = preemptible_attempts, + max_retries = max_retries + } + } + if (defined(variants_for_contamination)) { call CalculateContamination { input: @@ 
-261,6 +314,7 @@ workflow Mutect2 { ref_fai = ref_fai, ref_dict = ref_dict, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, gatk_docker = gatk_docker, tumor_bam = tumor_bam, tumor_bai = tumor_bai, @@ -282,6 +336,7 @@ workflow Mutect2 { output_name = filtered_name, compress = compress, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, contamination_table = CalculateContamination.contamination_table, maf_segments = CalculateContamination.maf_segments, m2_extra_filtering_args = m2_extra_filtering_args, @@ -301,6 +356,7 @@ workflow Mutect2 { compress = compress, gatk_docker = gatk_docker, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, pre_adapter_metrics = input_artifact_metrics, artifact_modes = artifact_modes, disk_space = ceil(size(Filter.filtered_vcf, "GB") * small_input_to_output_multiplier) + ceil(size(input_artifact_metrics, "GB")) + disk_pad @@ -318,6 +374,7 @@ workflow Mutect2 { realignment_index_bundle = select_first([realignment_index_bundle]), realignment_extra_args = realignment_extra_args, gatk_docker = gatk_docker, + max_retries = max_retries, compress = compress, output_name = filtered_name, input_vcf = realignment_filter_input, @@ -339,6 +396,7 @@ workflow Mutect2 { control_id = M2.normal_sample[0], oncotator_docker = oncotator_docker_or_default, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, disk_space = ceil(size(oncotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad, filter_maf = filter_oncotator_maf_or_default, oncotator_extra_args = oncotator_extra_args @@ -348,37 +406,38 @@ workflow Mutect2 { if (run_funcotator_or_default) { File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf]) File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index]) - call Funcotate { + call FuncotateMaf { input: - m2_vcf = funcotate_vcf_input, - m2_vcf_index = funcotate_vcf_input_index, + input_vcf = funcotate_vcf_input, + input_vcf_idx = funcotate_vcf_input_index, ref_fasta = ref_fasta, - ref_fai = ref_fai, + ref_fasta_index = ref_fai, ref_dict = ref_dict, - reference_version = select_first([reference_version, "NO_REFERENCE_VERSION_GIVEN"]), - output_name = funcotated_name, - compress = compress, - data_sources_tar_gz = data_sources_tar_gz, - transcript_selection_mode = transcript_selection_mode, - transcript_selection_list = transcript_selection_list, - annotation_defaults = annotation_defaults, - annotation_overrides = annotation_overrides, + reference_version = select_first([funco_reference_version, "hg19"]), + data_sources_tar_gz = funco_data_sources_tar_gz, + case_id = M2.tumor_sample[0], + control_id = M2.normal_sample[0], + transcript_selection_mode = funco_transcript_selection_mode, + transcript_selection_list = funco_transcript_selection_list, + annotation_defaults = funco_annotation_defaults, + annotation_overrides = funco_annotation_overrides, gatk_docker = gatk_docker, gatk_override = gatk_override, - filter_funcotations = filter_funcotations_or_default + filter_funcotations = filter_funcotations_or_default, + sequencing_center = sequencing_center, + sequence_source = sequence_source, + disk_space_gb = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad, + max_retries = max_retries, + extra_args = 
funcotator_extra_args } } output { - File unfiltered_vcf = MergeVCFs.merged_vcf - File unfiltered_vcf_index = MergeVCFs.merged_vcf_index File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf]) File filtered_vcf_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index]) File? contamination_table = CalculateContamination.contamination_table - File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf - File? funcotated_vcf = Funcotate.funcotated_vcf - File? funcotated_vcf_index = Funcotate.funcotated_vcf_index + File? funcotated_maf = FuncotateMaf.funcotated_output File? preadapter_detail_metrics = CollectSequencingArtifactMetrics.pre_adapter_metrics File? bamout = MergeBamOuts.merged_bam_out File? bamout_index = MergeBamOuts.merged_bam_out_index @@ -401,6 +460,7 @@ task SplitIntervals { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -429,6 +489,7 @@ task SplitIntervals { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } @@ -456,6 +517,7 @@ task M2 { Boolean compress File? gga_vcf File? gga_vcf_idx + File? artifact_prior_table String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf" String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" @@ -466,6 +528,7 @@ task M2 { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -502,6 +565,7 @@ task M2 { ${"--genotyping-mode GENOTYPE_GIVEN_ALLELES --alleles " + gga_vcf} \ -O "${output_vcf}" \ ${true='--bam-output bamout.bam' false='' make_bamout} \ + ${"--orientation-bias-artifact-priors " + artifact_prior_table} \ ${m2_extra_args} >>> @@ -511,6 +575,7 @@ task M2 { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) cpu: select_first([cpu, 1]) } @@ -538,6 +603,7 @@ task MergeVCFs { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -560,6 +626,7 @@ task MergeVCFs { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) cpu: select_first([cpu, 1]) } @@ -583,6 +650,7 @@ task MergeBamOuts { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -614,6 +682,7 @@ task MergeBamOuts { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) cpu: select_first([cpu, 1]) } @@ -623,6 +692,7 @@ task MergeBamOuts { } } +# This task is deprecated and is no longer supported task CollectSequencingArtifactMetrics { # inputs File ref_fasta @@ -636,6 +706,7 @@ task CollectSequencingArtifactMetrics { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? 
cpu Boolean use_ssd = false @@ -657,6 +728,7 @@ task CollectSequencingArtifactMetrics { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) cpu: select_first([cpu, 1]) } @@ -665,6 +737,114 @@ task CollectSequencingArtifactMetrics { } } +task CollectF1R2Counts { + # input + File ref_fasta + File ref_fai + File ref_dict + File tumor_bam + File tumor_bai + + File? gatk_override + File? intervals + + # runtime + Int? max_retries + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? disk_space + Int? cpu + Boolean use_ssd = false + + # Mem is in units of GB but our command and memory runtime values are in MB + Int machine_mem = if defined(mem) then mem * 1000 else 7000 + Int command_mem = machine_mem - 1000 + + command { + set -e + export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + + # Get the sample name. The task M2 retrieves this information too, but it must be done separately here + # to avoid a cyclic dependency + gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode + tumor_name=$(head -n 1 tumor_name.txt) + + gatk --java-options "-Xmx${command_mem}m" CollectF1R2Counts \ + -I ${tumor_bam} -R ${ref_fasta} \ + ${"-L " + intervals} \ + -alt-table "$tumor_name-alt.tsv" \ + -ref-hist "$tumor_name-ref.metrics" \ + -alt-hist "$tumor_name-alt-depth1.metrics" + } + + runtime { + docker: gatk_docker + bootDiskSizeGb: 12 + memory: machine_mem + " MB" + disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" + preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) + cpu: select_first([cpu, 1]) + } + + output { + File alt_table = glob("*-alt.tsv")[0] + File ref_histogram = glob("*-ref.metrics")[0] + File alt_histograms = glob("*-alt-depth1.metrics")[0] + String tumor_sample = read_string("tumor_name.txt") + } +} + +task LearnReadOrientationModel { + File alt_table + File ref_histogram + File? alt_histograms + + File? gatk_override + File? intervals + String tumor_sample + + # runtime + Int? max_retries + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? disk_space + Int? cpu + Boolean use_ssd = false + + # Mem is in units of GB but our command and memory runtime values are in MB + Int machine_mem = if defined(mem) then mem * 1000 else 8000 + Int command_mem = machine_mem - 1000 + + command { + set -e + export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + + gatk --java-options "-Xmx${command_mem}m" LearnReadOrientationModel \ + -alt-table ${alt_table} \ + -ref-hist ${ref_histogram} \ + -alt-hist ${alt_histograms} \ + -O "${tumor_sample}-artifact-prior-table.tsv" + } + + runtime { + docker: gatk_docker + bootDiskSizeGb: 12 + memory: machine_mem + " MB" + disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" + preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) + cpu: select_first([cpu, 1]) + } + + output { + File artifact_prior_table = "${tumor_sample}-artifact-prior-table.tsv" + } + +} + task CalculateContamination { # inputs File? intervals @@ -682,12 +862,13 @@ task CalculateContamination { # runtime Int? preemptible_attempts + Int? max_retries String gatk_docker Int? disk_space Int? 
mem # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 + Int machine_mem = if defined(mem) then mem * 1000 else 3000 Int command_mem = machine_mem - 500 command { @@ -696,11 +877,13 @@ task CalculateContamination { export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} if [[ -f "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O normal_pileups.table + gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal_pileups.table NORMAL_CMD="-matched normal_pileups.table" fi - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O pileups.table + gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ${variants_for_contamination} -L ${variants_for_contamination} -O pileups.table gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I pileups.table -O contamination.table --tumor-segmentation segments.table $NORMAL_CMD } @@ -710,6 +893,7 @@ task CalculateContamination { memory: command_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) } output { @@ -738,6 +922,7 @@ task Filter { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -764,6 +949,7 @@ task Filter { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) cpu: select_first([cpu, 1]) } @@ -785,8 +971,12 @@ task FilterByOrientationBias { File pre_adapter_metrics Array[String]? artifact_modes + # If artifact modes is passed in to the task as [], this task will fail. + Array[String] final_artifact_modes = select_first([artifact_modes, ["G/T", "C/T"]]) + # runtime Int? preemptible_attempts + Int? max_retries String gatk_docker Int? disk_space Int? mem @@ -804,7 +994,7 @@ task FilterByOrientationBias { gatk --java-options "-Xmx${command_mem}m" FilterByOrientationBias \ -V ${input_vcf} \ - -AM ${sep=" -AM " artifact_modes} \ + -AM ${sep=" -AM " final_artifact_modes} \ -P ${pre_adapter_metrics} \ -O ${output_vcf} } @@ -815,6 +1005,7 @@ task FilterByOrientationBias { memory: command_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) cpu: select_first([cpu, 1]) } @@ -842,6 +1033,7 @@ task FilterAlignmentArtifacts { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? 
cpu Boolean use_ssd = false @@ -869,6 +1061,7 @@ task FilterAlignmentArtifacts { memory: command_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) cpu: select_first([cpu, 1]) } @@ -895,6 +1088,7 @@ task oncotate_m2 { String oncotator_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -945,6 +1139,7 @@ task oncotate_m2 { bootDiskSizeGb: 12 disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) cpu: select_first([cpu, 1]) } @@ -959,6 +1154,7 @@ task SumFloats { # Runtime parameters Int? preemptible_attempts + Int? max_retries command <<< python -c "print ${sep="+" sizes}" @@ -972,101 +1168,116 @@ task SumFloats { docker: "python:2.7" disks: "local-disk " + 10 + " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 3]) } } -task Funcotate { - # inputs - File ref_fasta - File ref_fai - File ref_dict - File m2_vcf - File m2_vcf_index - String reference_version - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" - - File? data_sources_tar_gz - String? transcript_selection_mode - Array[String]? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides - Boolean filter_funcotations - - # ============== - # Process input args: - String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else "" - String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" - String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" - String filter_funcotations_args = if (filter_funcotations) then " --remove-filtered-variants " else "" - # ============== - - # runtime - - String gatk_docker - File? gatk_override - Int? mem - Int? preemptible_attempts - Int? disk_space_gb - Int? cpu - - Boolean use_ssd = false - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 3000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. - Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - - command <<< - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - DATA_SOURCES_TAR_GZ=${data_sources_tar_gz} - if [[ ! -e $DATA_SOURCES_TAR_GZ ]] ; then - # We have to download the data sources: - echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ" - echo "Downloading default data sources..." 
- wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/funcotator_dataSources.v1.0.20180105.tar.gz - tar -zxf funcotator_dataSources.v1.0.20180105.tar.gz - DATA_SOURCES_FOLDER=funcotator_dataSources.v1.0.20180105 - else - # Extract the tar.gz: - mkdir datasources_dir - tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1 - DATA_SOURCES_FOLDER="$PWD/datasources_dir" - fi - - gatk --java-options "-Xmx${command_mem}m" Funcotator \ - --data-sources-path $DATA_SOURCES_FOLDER \ - --ref-version ${reference_version} \ - -R ${ref_fasta} \ - -V ${m2_vcf} \ - -O ${output_vcf} \ - ${"--transcript-selection-mode " + transcript_selection_mode} \ - ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \ - ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \ - ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \ - ${filter_funcotations_args} - >>> - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - cpu: select_first([cpu, 1]) - } - - output { - File funcotated_vcf = "${output_vcf}" - File funcotated_vcf_index = "${output_vcf_index}" - } -} - +task FuncotateMaf { + # inputs + File ref_fasta + File ref_fasta_index + File ref_dict + File input_vcf + File input_vcf_idx + String reference_version + String output_format = "MAF" + String? sequencing_center + String? sequence_source + String case_id + String? control_id + + File? data_sources_tar_gz + String? transcript_selection_mode + File? transcript_selection_list + Array[String]? annotation_defaults + Array[String]? annotation_overrides + Boolean filter_funcotations + File? interval_list + + String? extra_args + + # ============== + # Process input args: + String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" + String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" + String filter_funcotations_args = if (filter_funcotations) then " --remove-filtered-variants " else "" + String final_output_filename = basename(input_vcf, ".vcf") + ".maf.annotated" + # ============== + + # runtime + + String gatk_docker + File? gatk_override + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? disk_space_gb + Int? cpu + + Boolean use_ssd = false + + # This should be updated when a new version of the data sources is released + String default_datasources_version = "funcotator_dataSources.v1.4.20180615" + + # You may have to change the following two parameter values depending on the task requirements + Int default_ram_mb = 3000 + # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). + Int default_disk_space_gb = 100 + + # Mem is in units of GB but our command and memory runtime values are in MB + Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb + Int command_mem = machine_mem - 1000 + + command <<< + set -e + export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + + DATA_SOURCES_TAR_GZ=${data_sources_tar_gz} + if [[ ! -e $DATA_SOURCES_TAR_GZ ]] ; then + # We have to download the data sources: + echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ" + echo "Downloading default data sources..." 
+ wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/${default_datasources_version}.tar.gz + tar -zxf ${default_datasources_version}.tar.gz + DATA_SOURCES_FOLDER=${default_datasources_version} + else + # Extract the tar.gz: + mkdir datasources_dir + tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1 + DATA_SOURCES_FOLDER="$PWD/datasources_dir" + fi + + gatk --java-options "-Xmx${command_mem}m" Funcotator \ + --data-sources-path $DATA_SOURCES_FOLDER \ + --ref-version ${reference_version} \ + --output-file-format ${output_format} \ + -R ${ref_fasta} \ + -V ${input_vcf} \ + -O ${final_output_filename} \ + ${"-L " + interval_list} \ + ${"--transcript-selection-mode " + transcript_selection_mode} \ + ${"--transcript-list " + transcript_selection_list} \ + --annotation-default normal_barcode:${control_id} \ + --annotation-default tumor_barcode:${case_id} \ + --annotation-default Center:${default="Unknown" sequencing_center} \ + --annotation-default source:${default="Unknown" sequence_source} \ + ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \ + ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \ + ${filter_funcotations_args} \ + ${extra_args} + >>> + + runtime { + docker: gatk_docker + bootDiskSizeGb: 20 + memory: machine_mem + " MB" + disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD" + preemptible: select_first([preemptible_attempts, 3]) + maxRetries: select_first([max_retries, 3]) + cpu: select_first([cpu, 1]) + } + + output { + File funcotated_output = "${final_output_filename}" + } + } diff --git a/mutect2_nio.wdl b/mutect2_nio.wdl index 9823f47..222c82f 100644 --- a/mutect2_nio.wdl +++ b/mutect2_nio.wdl @@ -15,6 +15,7 @@ ## ** Runtime ** ## gatk_docker, oncotator_docker: docker images to use for GATK 4 Mutect2 and for Oncotator ## preemptible_attempts: how many preemptions to tolerate before switching to a non-preemptible machine (on Google) +## max_retries: how many times to retry failed tasks -- very important on the cloud when there are transient errors ## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar ## in the docker image. This must be supplied when running in an environment that does not support docker ## (e.g. SGE cluster on a Broad on-prem VM) @@ -25,7 +26,7 @@ ## artifact_modes: types of artifacts to consider in the orientation bias filter (optional) ## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional) ## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional) -## run_orientation_bias_filter: if true, run the orientation bias filter post-processing step (optional, false by default) +## run_orientation_bias_filter: if true, run the orientation bias filter post-processing step (optional, true by default) ## run_oncotator: if true, annotate the M2 VCFs using oncotator (to produce a TCGA MAF). Important: This requires a ## docker image and should not be run in environments where docker is unavailable (e.g. SGE cluster on ## a Broad on-prem VM). Access to docker hub is also required, since the task downloads a public docker image. @@ -47,6 +48,14 @@ ## filter_oncotator_maf: Whether the MAF generated by oncotator should have the filtered variants removed. 
Default: true ## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator. ## +## Funcotator parameters (see Funcotator help for more details). +## funco_reference_version: "hg19" for hg19 or b37. "hg38" for hg38. Default: "hg19" +## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process. +## funco_transcript_selection_mode: How to select transcripts in Funcotator. ALL, CANONICAL, or BEST_EFFECT +## funco_data_sources_tar_gz: Funcotator datasources tar gz file. Bucket location is recommended when running on the cloud. +## funco_annotation_defaults: Default values for annotations, when values are unspecified. Specified as :. For example: "Center:Broad" +## funco_annotation_overrides: Values for annotations, even when values are unspecified. Specified as :. For example: "Center:Broad" +## ## Outputs : ## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam ## file of reassembled reads if requested @@ -78,8 +87,8 @@ workflow Mutect2 { File? realignment_index_bundle String? realignment_extra_args Boolean? run_orientation_bias_filter - Boolean run_ob_filter = select_first([run_orientation_bias_filter, false]) Array[String]? artifact_modes + Boolean run_ob_filter = select_first([run_orientation_bias_filter, true]) && (length(select_first([artifact_modes, ["G/T", "C/T"]])) > 0) File? tumor_sequencing_artifact_metrics String? m2_extra_args String? m2_extra_filtering_args @@ -102,12 +111,12 @@ workflow Mutect2 { # funcotator inputs Boolean? run_funcotator Boolean run_funcotator_or_default = select_first([run_funcotator, false]) - String? reference_version - String? data_sources_tar_gz - String? transcript_selection_mode - Array[String]? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides + String? funco_reference_version + File? funco_data_sources_tar_gz + String? funco_transcript_selection_mode + File? funco_transcript_selection_list + Array[String]? funco_annotation_defaults + Array[String]? funco_annotation_overrides File? gatk_override @@ -118,9 +127,13 @@ workflow Mutect2 { String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.9.0"]) Boolean? filter_oncotator_maf Boolean filter_oncotator_maf_or_default = select_first([filter_oncotator_maf, true]) + Boolean? filter_funcotations + Boolean filter_funcotations_or_default = select_first([filter_funcotations, true]) String? oncotator_extra_args + String? funcotator_extra_args Int? preemptible_attempts + Int? max_retries # Use as a last resort to increase the disk given to every task in case of ill behaving data Int? 
emergency_extra_disk @@ -133,6 +146,7 @@ workflow Mutect2 { # If no tar is provided, the task downloads one from broads ftp server Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100 + Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100 Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 # This is added to every task as padding, should increase if systematically you need more disk for every call @@ -167,6 +181,7 @@ workflow Mutect2 { gatk_override = gatk_override, gatk_docker = gatk_docker, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, disk_space = ref_size + ceil(size(intervals, "GB") * small_input_to_output_multiplier) + disk_pad } @@ -180,6 +195,7 @@ workflow Mutect2 { pon = pon, gnomad = gnomad, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, m2_extra_args = m2_extra_args, make_bamout = make_bamout_or_default, compress = compress, @@ -196,7 +212,8 @@ workflow Mutect2 { call SumFloats as SumSubVcfs { input: sizes = sub_vcf_size, - preemptible_attempts = preemptible_attempts + preemptible_attempts = preemptible_attempts, + max_retries = max_retries } call MergeVCFs { @@ -208,6 +225,7 @@ workflow Mutect2 { gatk_override = gatk_override, gatk_docker = gatk_docker, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad } @@ -215,7 +233,8 @@ workflow Mutect2 { call SumFloats as SumSubBamouts { input: sizes = sub_bamout_size, - preemptible_attempts = preemptible_attempts + preemptible_attempts = preemptible_attempts, + max_retries = max_retries } call MergeBamOuts { @@ -227,7 +246,8 @@ workflow Mutect2 { output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), gatk_override = gatk_override, gatk_docker = gatk_docker, - disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad + disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad, + max_retries = max_retries } } @@ -238,6 +258,7 @@ workflow Mutect2 { ref_fasta = ref_fasta, ref_fai = ref_fai, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, tumor_bam = tumor_bam, tumor_bai = tumor_bai, gatk_override = gatk_override, @@ -252,6 +273,7 @@ workflow Mutect2 { intervals = intervals, ref_fasta = ref_fasta, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, gatk_docker = gatk_docker, tumor_bam = tumor_bam, normal_bam = normal_bam, @@ -269,6 +291,7 @@ workflow Mutect2 { output_name = filtered_name, compress = compress, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, contamination_table = CalculateContamination.contamination_table, maf_segments = CalculateContamination.maf_segments, m2_extra_filtering_args = m2_extra_filtering_args, @@ -287,6 +310,7 @@ workflow Mutect2 { compress = compress, gatk_docker = gatk_docker, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, pre_adapter_metrics = input_artifact_metrics, artifact_modes = artifact_modes, disk_space = ceil(size(Filter.filtered_vcf, "GB") * small_input_to_output_multiplier) + ceil(size(input_artifact_metrics, "GB")) + disk_pad @@ -302,6 +326,7 @@ workflow Mutect2 { realignment_index_bundle = select_first([realignment_index_bundle]), realignment_extra_args = realignment_extra_args, gatk_docker = gatk_docker, + max_retries 
= max_retries, compress = compress, output_name = filtered_name, input_vcf = realignment_filter_input @@ -322,6 +347,7 @@ workflow Mutect2 { control_id = M2.normal_sample[0], oncotator_docker = oncotator_docker_or_default, preemptible_attempts = preemptible_attempts, + max_retries = max_retries, disk_space = ceil(size(oncotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad, filter_maf = filter_oncotator_maf_or_default, oncotator_extra_args = oncotator_extra_args @@ -329,36 +355,39 @@ workflow Mutect2 { } if (run_funcotator_or_default) { - File funcotate_vcf_input = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf]) - File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index]) - call Funcotate { + File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf]) + File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index]) + call FuncotateMaf { input: - m2_vcf = funcotate_vcf_input, - m2_vcf_index = funcotate_vcf_input_index, + input_vcf = funcotate_vcf_input, + input_vcf_idx = funcotate_vcf_input_index, ref_fasta = ref_fasta, - reference_version = select_first([reference_version, "NO_REFERENCE_VERSION_GIVEN"]), - output_name = funcotated_name, - compress = compress, - data_sources_tar_gz = data_sources_tar_gz, - transcript_selection_mode = transcript_selection_mode, - transcript_selection_list = transcript_selection_list, - annotation_defaults = annotation_defaults, - annotation_overrides = annotation_overrides, + reference_version = select_first([funco_reference_version, "hg19"]), + data_sources_tar_gz = funco_data_sources_tar_gz, + case_id = M2.tumor_sample[0], + control_id = M2.normal_sample[0], + transcript_selection_mode = funco_transcript_selection_mode, + transcript_selection_list = funco_transcript_selection_list, + annotation_defaults = funco_annotation_defaults, + annotation_overrides = funco_annotation_overrides, gatk_docker = gatk_docker, - gatk_override = gatk_override + gatk_override = gatk_override, + filter_funcotations = filter_funcotations_or_default, + sequencing_center = sequencing_center, + sequence_source = sequence_source, + disk_space_gb = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + funco_tar_size + disk_pad, + max_retries = max_retries, + extra_args = funcotator_extra_args } } output { - File unfiltered_vcf = MergeVCFs.merged_vcf - File unfiltered_vcf_index = MergeVCFs.merged_vcf_index File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf]) File filtered_vcf_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index]) File? contamination_table = CalculateContamination.contamination_table File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf - File? funcotated_vcf = Funcotate.funcotated_vcf - File? funcotated_vcf_index = Funcotate.funcotated_vcf_index + File? funcotated_maf = FuncotateMaf.funcotated_output File? preadapter_detail_metrics = CollectSequencingArtifactMetrics.pre_adapter_metrics File? bamout = MergeBamOuts.merged_bam_out File? bamout_index = MergeBamOuts.merged_bam_out_index @@ -381,6 +410,7 @@ task SplitIntervals { String gatk_docker Int? 
mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -409,6 +439,7 @@ task SplitIntervals { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } @@ -440,6 +471,7 @@ task M2 { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -485,6 +517,7 @@ task M2 { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } @@ -512,6 +545,7 @@ task MergeVCFs { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -534,6 +568,7 @@ task MergeVCFs { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } @@ -557,6 +592,7 @@ task MergeBamOuts { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -588,6 +624,7 @@ task MergeBamOuts { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } @@ -610,6 +647,7 @@ task CollectSequencingArtifactMetrics { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -631,6 +669,7 @@ task CollectSequencingArtifactMetrics { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } @@ -651,12 +690,13 @@ task CalculateContamination { # runtime Int? preemptible_attempts + Int? max_retries String gatk_docker Int? disk_space Int? 
mem # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 + Int machine_mem = if defined(mem) then mem * 1000 else 3000 Int command_mem = machine_mem - 500 command { @@ -665,11 +705,13 @@ task CalculateContamination { export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} if [[ -f "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O normal_pileups.table + gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal_pileups.table NORMAL_CMD="-matched normal_pileups.table" fi - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O pileups.table + gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ${variants_for_contamination} -L ${variants_for_contamination} -O pileups.table gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I pileups.table -O contamination.table --tumor-segmentation segments.table $NORMAL_CMD } @@ -679,6 +721,7 @@ task CalculateContamination { memory: command_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) } output { @@ -706,6 +749,7 @@ task Filter { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -732,6 +776,7 @@ task Filter { memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } @@ -752,8 +797,12 @@ task FilterByOrientationBias { File pre_adapter_metrics Array[String]? artifact_modes + # If artifact modes is passed in to the task as [], this task will fail. + Array[String] final_artifact_modes = select_first([artifact_modes, ["G/T", "C/T"]]) + # runtime Int? preemptible_attempts + Int? max_retries String gatk_docker Int? disk_space Int? mem @@ -771,7 +820,7 @@ task FilterByOrientationBias { gatk --java-options "-Xmx${command_mem}m" FilterByOrientationBias \ -V ${input_vcf} \ - -AM ${sep=" -AM " artifact_modes} \ + -AM ${sep=" -AM " final_artifact_modes} \ -P ${pre_adapter_metrics} \ -O ${output_vcf} } @@ -782,6 +831,7 @@ task FilterByOrientationBias { memory: command_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } @@ -807,6 +857,7 @@ task FilterAlignmentArtifacts { String gatk_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? 
cpu Boolean use_ssd = false @@ -834,6 +885,7 @@ task FilterAlignmentArtifacts { memory: command_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } @@ -860,6 +912,7 @@ task oncotate_m2 { String oncotator_docker Int? mem Int? preemptible_attempts + Int? max_retries Int? disk_space Int? cpu Boolean use_ssd = false @@ -910,6 +963,7 @@ task oncotate_m2 { bootDiskSizeGb: 12 disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } @@ -924,6 +978,7 @@ task SumFloats { # Runtime parameters Int? preemptible_attempts + Int? max_retries command <<< python -c "print ${sep="+" sizes}" @@ -937,96 +992,114 @@ task SumFloats { docker: "python:2.7" disks: "local-disk " + 10 + " HDD" preemptible: select_first([preemptible_attempts, 10]) + maxRetries: select_first([max_retries, 0]) } } -task Funcotate { - # inputs - String ref_fasta - String m2_vcf - String m2_vcf_index - String reference_version - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" - - File? data_sources_tar_gz - String? transcript_selection_mode - Array[String]? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides - - # ============== - # Process input args: - String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else "" - String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" - String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" - # ============== - - # runtime - - String gatk_docker - File? gatk_override - Int? mem - Int? preemptible_attempts - Int? disk_space_gb - Int? cpu - - Boolean use_ssd = false - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 3000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. - Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - - command <<< - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - DATA_SOURCES_TAR_GZ=${data_sources_tar_gz} - if [[ ! -e $DATA_SOURCES_TAR_GZ ]] ; then - # We have to download the data sources: - echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ" - echo "Downloading default data sources..." 
- wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/funcotator_dataSources.v1.0.20180105.tar.gz - tar -zxf funcotator_dataSources.v1.0.20180105.tar.gz - DATA_SOURCES_FOLDER=funcotator_dataSources.v1.0.20180105 - else - # Extract the tar.gz: - mkdir datasources_dir - tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1 - DATA_SOURCES_FOLDER="$PWD/datasources_dir" - fi - - gatk --java-options "-Xmx${command_mem}m" Funcotator \ - --data-sources-path $DATA_SOURCES_FOLDER \ - --ref-version ${reference_version} \ - -R ${ref_fasta} \ - -V ${m2_vcf} \ - -O ${output_vcf} \ - ${"--transcript-selection-mode " + transcript_selection_mode} \ - ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \ - ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \ - ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} - >>> - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - cpu: select_first([cpu, 1]) - } - - output { - File funcotated_vcf = "${output_vcf}" - File funcotated_vcf_index = "${output_vcf_index}" - } -} - +task FuncotateMaf { + # inputs + String ref_fasta + String input_vcf + String input_vcf_idx + String reference_version + String output_format = "MAF" + String? sequencing_center + String? sequence_source + String case_id + String? control_id + + File? data_sources_tar_gz + String? transcript_selection_mode + File? transcript_selection_list + Array[String]? annotation_defaults + Array[String]? annotation_overrides + Boolean filter_funcotations + File? interval_list + + String? extra_args + + # ============== + # Process input args: + String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" + String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" + String filter_funcotations_args = if (filter_funcotations) then " --remove-filtered-variants " else "" + String final_output_filename = basename(input_vcf, ".vcf") + ".maf.annotated" + # ============== + + # runtime + + String gatk_docker + File? gatk_override + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? disk_space_gb + Int? cpu + + Boolean use_ssd = false + + # This should be updated when a new version of the data sources is released + String default_datasources_version = "funcotator_dataSources.v1.4.20180615" + + # You may have to change the following two parameter values depending on the task requirements + Int default_ram_mb = 3000 + # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). + Int default_disk_space_gb = 100 + + # Mem is in units of GB but our command and memory runtime values are in MB + Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb + Int command_mem = machine_mem - 1000 + + command <<< + set -e + export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + + DATA_SOURCES_TAR_GZ=${data_sources_tar_gz} + if [[ ! -e $DATA_SOURCES_TAR_GZ ]] ; then + # We have to download the data sources: + echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ" + echo "Downloading default data sources..." 
+            wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/${default_datasources_version}.tar.gz
+            tar -zxf ${default_datasources_version}.tar.gz
+            DATA_SOURCES_FOLDER=${default_datasources_version}
+        else
+            # Extract the tar.gz:
+            mkdir datasources_dir
+            tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1
+            DATA_SOURCES_FOLDER="$PWD/datasources_dir"
+        fi
+
+        gatk --java-options "-Xmx${command_mem}m" Funcotator \
+            --data-sources-path $DATA_SOURCES_FOLDER \
+            --ref-version ${reference_version} \
+            --output-file-format ${output_format} \
+            -R ${ref_fasta} \
+            -V ${input_vcf} \
+            -O ${final_output_filename} \
+            ${"-L " + interval_list} \
+            ${"--transcript-selection-mode " + transcript_selection_mode} \
+            ${"--transcript-list " + transcript_selection_list} \
+            --annotation-default normal_barcode:${control_id} \
+            --annotation-default tumor_barcode:${case_id} \
+            --annotation-default Center:${default="Unknown" sequencing_center} \
+            --annotation-default source:${default="Unknown" sequence_source} \
+            ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \
+            ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \
+            ${filter_funcotations_args} \
+            ${extra_args}
+    >>>
+
+    runtime {
+        docker: gatk_docker
+        bootDiskSizeGb: 20
+        memory: machine_mem + " MB"
+        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
+        preemptible: select_first([preemptible_attempts, 3])
+        maxRetries: select_first([max_retries, 0])
+        cpu: select_first([cpu, 1])
+    }
+
+    output {
+        File funcotated_output = "${final_output_filename}"
+    }
+}
diff --git a/mutect2_pon.inputs.json b/mutect2_pon.inputs.json
index 4febcb6..b560c2d 100644
--- a/mutect2_pon.inputs.json
+++ b/mutect2_pon.inputs.json
@@ -3,23 +3,22 @@
   "Mutect2_Panel.normal_bams": "Array[File]",
   "Mutect2_Panel.normal_bais": "Array[File]",
   "Mutect2_Panel.pon_name": "String",
-  "##_Mutect2_Panel.Mutect2.normal_bam": "(optional) File?",
-  "##_Mutect2_Panel.Mutect2.normal_bai": "(optional) File?",
-
+  "#Mutect2_Panel.Mutect2.normal_bam": "(optional) File?",
+  "#Mutect2_Panel.Mutect2.normal_bai": "(optional) File?",
   "##_COMMENT2": "Primary resources",
-  "Mutect2_Panel.ref_fasta": "File",
-  "Mutect2_Panel.ref_fai": "File",
-  "Mutect2_Panel.ref_dict": "File",
-  "Mutect2_Panel.scatter_count": "Int",
+  "Mutect2_Panel.ref_fasta": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.fasta",
+  "Mutect2_Panel.ref_fai": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.fasta.fai",
+  "Mutect2_Panel.ref_dict": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.dict",
+  "Mutect2_Panel.scatter_count": "10",
   "##_COMMENT3": "Secondary resources",
-  "##_Mutect2_Panel.intervals": "(optional) File?",
+  "Mutect2_Panel.intervals": "gs://gatk-best-practices/somatic-b37/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list",
   "##_Mutect2_Panel.Mutect2.data_sources_tar_gz": "(optional) String?",
-  "##_Mutect2_Panel.Mutect2.variants_for_contamination": "(optional) File?",
+  "Mutect2_Panel.Mutect2.variants_for_contamination": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf",
   "##_Mutect2_Panel.Mutect2.variants_for_contamination_index": "(optional) File?",
   "##_Mutect2_Panel.Mutect2.tumor_sequencing_artifact_metrics": "(optional) File?",
-  "##_Mutect2_Panel.Mutect2.gnomad": "(optional) File?",
+  "Mutect2_Panel.Mutect2.gnomad": "gs://gatk-best-practices/somatic-b37/af-only-gnomad.raw.sites.vcf",
   "##_Mutect2_Panel.Mutect2.gnomad_index": "(optional) File?",
   "##_Mutect2_Panel.Mutect2.onco_ds_tar_gz": "(optional) File?",
   "##_Mutect2_Panel.Mutect2.pon": "(optional) File?",
@@ -28,39 +27,39 @@
   "##_Mutect2_Panel.gatk_override": "(optional) File?",
   "##_COMMENT4": "Secondary resources",
-  "##_Mutect2_Panel.duplicate_sample_strategy": "(optional) String?",
-  "##_Mutect2_Panel.m2_extra_args": "(optional) String?",
-  "##_Mutect2_Panel.Mutect2.onco_ds_local_db_dir": "(optional) String?",
-  "##_Mutect2_Panel.Mutect2.annotation_defaults": "(optional) Array[String]?",
-  "##_Mutect2_Panel.Mutect2.reference_version": "(optional) String?",
-  "##_Mutect2_Panel.Mutect2.annotation_overrides": "(optional) Array[String]?",
-  "##_Mutect2_Panel.Mutect2.artifact_modes": "(optional) Array[String]?",
-  "##_Mutect2_Panel.Mutect2.sequence_source": "(optional) String?",
-  "##_Mutect2_Panel.Mutect2.transcript_selection_list": "(optional) Array[String]?",
-  "##_Mutect2_Panel.Mutect2.split_intervals_extra_args": "(optional) String?",
-  "##_Mutect2_Panel.Mutect2.sequencing_center": "(optional) String?",
-  "##_Mutect2_Panel.Mutect2.transcript_selection_mode": "(optional) String?",
-  "##_Mutect2_Panel.Mutect2.m2_extra_filtering_args": "(optional) String?",
+  "#Mutect2_Panel.duplicate_sample_strategy": "(optional) String?",
+  "#Mutect2_Panel.m2_extra_args": "(optional) String?",
+  "#Mutect2_Panel.Mutect2.onco_ds_local_db_dir": "(optional) String?",
+  "#Mutect2_Panel.Mutect2.annotation_defaults": "(optional) Array[String]?",
+  "#Mutect2_Panel.Mutect2.reference_version": "(optional) String?",
+  "#Mutect2_Panel.Mutect2.annotation_overrides": "(optional) Array[String]?",
+  "Mutect2_Panel.Mutect2.artifact_modes": ["G/T", "C/T"],
+  "#Mutect2_Panel.Mutect2.sequence_source": "(optional) String?",
+  "#Mutect2_Panel.Mutect2.transcript_selection_list": "(optional) Array[String]?",
+  "#Mutect2_Panel.Mutect2.split_intervals_extra_args": "(optional) String?",
+  "#Mutect2_Panel.Mutect2.sequencing_center": "(optional) String?",
+  "#Mutect2_Panel.Mutect2.transcript_selection_mode": "(optional) String?",
+  "#Mutect2_Panel.Mutect2.m2_extra_filtering_args": "(optional) String?",
   "##_COMMENT5": "Boolean Options",
-  "##_Mutect2_Panel.Mutect2.run_oncotator": "(optional) Boolean?",
-  "##_Mutect2_Panel.Mutect2.make_bamout": "(optional) Boolean?",
-  "##_Mutect2_Panel.Mutect2.run_funcotator": "(optional) Boolean?",
-  "##_Mutect2_Panel.Mutect2.compress_vcfs": "(optional) Boolean?",
-  "##_Mutect2_Panel.Mutect2.run_orientation_bias_filter": "(optional) Boolean?",
+  "#Mutect2_Panel.Mutect2.run_oncotator": "(optional) Boolean?",
+  "#Mutect2_Panel.Mutect2.make_bamout": "(optional) Boolean?",
+  "#Mutect2_Panel.Mutect2.run_funcotator": "(optional) Boolean?",
+  "#Mutect2_Panel.Mutect2.compress_vcfs": "(optional) Boolean?",
+  "#Mutect2_Panel.Mutect2.run_orientation_bias_filter": "(optional) Boolean?",
   "##_COMMENT6": "Docker",
-  "Mutect2_Panel.gatk_docker": "String",
-  "##_Mutect2_Panel.Mutect2.oncotator_docker": "(optional) String?",
+  "Mutect2_Panel.gatk_docker": "broadinstitute/gatk:4.0.8.1",
+  "#Mutect2_Panel.Mutect2.oncotator_docker": "(optional) String?",
   "##_COMMENT7": "Disk space",
-  "##_Mutect2_Panel.Mutect2.emergency_extra_disk": "(optional) Int?",
-  "##_Mutect2_Panel.CreatePanel.disk_space": "(optional) Int?",
+  "#Mutect2_Panel.Mutect2.emergency_extra_disk": "(optional) Int?",
+  "#Mutect2_Panel.CreatePanel.disk_space": "(optional) Int?",
   "##_COMMENT8": "Preemptibles",
-  "##_Mutect2_Panel.preemptible_attempts": "(optional) Int?",
+  "#Mutect2_Panel.preemptible_attempts": "(optional) Int?",
   "##_COMMENT9": "Addtional Runtime Parameters",
-  "##_Mutect2_Panel.CreatePanel.mem": "(optional) Int?",
-  "##_Mutect2_Panel.CreatePanel.cpu": "(optional) Int?"
+  "#Mutect2_Panel.CreatePanel.mem": "(optional) Int?",
+  "#Mutect2_Panel.CreatePanel.cpu": "(optional) Int?"
 }
diff --git a/mutect2_pon.wdl b/mutect2_pon.wdl
index 42a5de0..bc8c7c8 100644
--- a/mutect2_pon.wdl
+++ b/mutect2_pon.wdl
@@ -34,6 +34,7 @@ workflow Mutect2_Panel {
     # runtime
     String gatk_docker
     Int? preemptible_attempts
+    Int? max_retries
     Array[Pair[File,File]] normal_bam_pairs = zip(normal_bams, normal_bais)
@@ -53,26 +54,28 @@ workflow Mutect2_Panel {
             m2_extra_args = m2_extra_args,
             gatk_override = gatk_override,
             gatk_docker = gatk_docker,
-            preemptible_attempts = preemptible_attempts
+            preemptible_attempts = preemptible_attempts,
+            max_retries = max_retries
         }
     }
     call CreatePanel {
         input:
-            input_vcfs = Mutect2.unfiltered_vcf,
-            input_vcfs_idx = Mutect2.unfiltered_vcf_index,
+            input_vcfs = Mutect2.filtered_vcf,
+            input_vcfs_idx = Mutect2.filtered_vcf_index,
             duplicate_sample_strategy = duplicate_sample_strategy,
             output_vcf_name = pon_name,
             gatk_override = gatk_override,
             preemptible_attempts = preemptible_attempts,
+            max_retries = max_retries,
             gatk_docker = gatk_docker
     }
     output {
         File pon = CreatePanel.output_vcf
         File pon_idx = CreatePanel.output_vcf_index
-        Array[File] normal_calls = Mutect2.unfiltered_vcf
-        Array[File] normal_calls_idx = Mutect2.unfiltered_vcf_index
+        Array[File] normal_calls = Mutect2.filtered_vcf
+        Array[File] normal_calls_idx = Mutect2.filtered_vcf_index
     }
 }
@@ -89,6 +92,7 @@ task CreatePanel {
     String gatk_docker
     Int? mem
    Int? preemptible_attempts
+    Int? max_retries
     Int? disk_space
     Int? cpu
     Boolean use_ssd = false
@@ -108,6 +112,7 @@ task CreatePanel {
        memory: machine_mem + " GB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 3])
+       maxRetries: select_first([max_retries, 0])
        cpu: select_first([cpu, 1])
    }