diff --git a/README.md b/README.md index bacaeb2..2215c4c 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,7 @@ Workflows for somatic short variant analysis with GATK4. ### mutect2 : -Implements Somatic short variant discovery using [GATK Best Practices](https://software.broadinstitute.org/gatk/best-practices/workflow). -Note: Also provided in this repo is mutect2_nio which is a NIO supported version of the wdl. +Implements Somatic short variant discovery using [GATK Best Practices](https://software.broadinstitute.org/gatk/best-practices/workflow). #### Requirements/expectations - Tumor bam and index @@ -37,10 +36,10 @@ Used to validate mutect2 workflow. - False Positive VCF files and its index with summary ### Software version requirements : -- GATK4.1.2.0 +- GATK4.1.4.0 Cromwell version support -- Successfully tested on v36 +- Successfully tested on v46 ### Parameter descriptions : @@ -104,7 +103,7 @@ By default the M2 WDL runs Funcotator for functional annotation and produce a TC - Runtime parameters are optimized for Broad's Google Cloud Platform implementation. - For help running workflows on the Google Cloud Platform or locally please view the following tutorial [(How to) Execute Workflows from the gatk-workflows Git Organization](https://software.broadinstitute.org/gatk/documentation/article?id=12521). -- The following material is provided by the GATK Team. Please post any questions or concerns to one of our forum sites : [GATK](https://gatkforums.broadinstitute.org/gatk/categories/ask-the-team/) , [FireCloud](https://gatkforums.broadinstitute.org/firecloud/categories/ask-the-firecloud-team) , [WDL/Cromwell](https://gatkforums.broadinstitute.org/wdl/categories/ask-the-wdl-team). +- The following material is provided by the GATK Team. 
Please post any questions or concerns to one of our forum sites : [GATK](https://gatkforums.broadinstitute.org/gatk/categories/ask-the-team/) , [Terra](https://support.terra.bio/hc/en-us/community/topics/360000500432) , [WDL/Cromwell](https://gatkforums.broadinstitute.org/wdl/categories/ask-the-wdl-team). - Please visit the [User Guide](https://software.broadinstitute.org/gatk/documentation/) site for further documentation on our workflows and tools. ### LICENSING : diff --git a/mutect2.exome.inputs.json b/mutect2.exome.inputs.json deleted file mode 100644 index 8ead5a7..0000000 --- a/mutect2.exome.inputs.json +++ /dev/null @@ -1,67 +0,0 @@ -{ - "##_COMMENT1": "Runtime", - "##Mutect2.oncotator_docker": "(optional) String?", - "Mutect2.gatk_docker": "broadinstitute/gatk:4.1.2.0", - - "##_COMMENT2": "Workflow options", - "Mutect2.intervals": "gs://gatk-best-practices/somatic-b37/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list", - "Mutect2.scatter_count": 50, - "Mutect2.artifact_modes": ["G/T", "C/T"], - "##_Mutect2.m2_extra_args": "(optional) String?", - "##_Mutect2.m2_extra_filtering_args": "(optional) String?", - "Mutect2.run_orientation_bias_filter": "False", - "Mutect2.run_oncotator": "False", - - "##_COMMENT3": "Primary inputs", - "Mutect2.ref_fasta": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.fasta", - "Mutect2.ref_dict": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.dict", - "Mutect2.ref_fai": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.fasta.fai", - "Mutect2.normal_reads": "gs://gatk-best-practices/somatic-b37/HCC1143_normal.bam", - "Mutect2.normal_reads_index": "gs://gatk-best-practices/somatic-b37/HCC1143_normal.bai", - "Mutect2.tumor_reads": "gs://gatk-best-practices/somatic-b37/HCC1143.bam", - "Mutect2.tumor_reads_index": "gs://gatk-best-practices/somatic-b37/HCC1143.bai", - - "##_COMMENT4": "Primary resources", - "Mutect2.pon": 
"gs://gatk-best-practices/somatic-b37/Mutect2-exome-panel.vcf", - "Mutect2.pon_idx": "gs://gatk-best-practices/somatic-b37/Mutect2-exome-panel.vcf.idx", - "Mutect2.gnomad": "gs://gatk-best-practices/somatic-b37/af-only-gnomad.raw.sites.vcf", - "Mutect2.gnomad_idx": "gs://gatk-best-practices/somatic-b37/af-only-gnomad.raw.sites.vcf.idx", - "Mutect2.variants_for_contamination": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf", - "Mutect2.variants_for_contamination_idx": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf.idx", - "##Mutect2.realignment_index_bundle": "File? (optional)", - - "##_COMMENT5": "Secondary resources", - "Mutect2.onco_ds_tar_gz": "gs://gatk-best-practices/somatic-b37/oncotator_v1_ds_April052016.tar.gz", - "Mutect2.default_config_file": "gs://gatk-best-practices/somatic-b37/onco_config.txt", - "##_Mutect2.sequencing_center": "(optional) String?", - "##_Mutect2.sequence_source": "(optional) String?", - - "##_COMMENT6": "Secondary resources", - "##_Mutect2.MergeBamOuts.mem": "(optional) Int?", - "##_Mutect2.SplitIntervals.mem": "(optional) Int?", - "##_Mutect2.M2.mem": "(optional) Int?", - "##_Mutect2.MergeVCFs.mem": "(optional) Int?", - "##_Mutect2.oncotate_m2.mem": "(optional) Int?", - - "##_COMMENT7": "Secondary resources", - "##_Mutect2.onco_ds_local_db_dir": "(optional) String?", - "##_Mutect2.sequencing_center": "(optional) String?", - "##_Mutect2.oncotate_m2.oncotator_exe": "(optional) String?", - "##_Mutect2.gatk4_override": "(optional) File?", - "##_Mutect2.CollectSequencingArtifactMetrics.mem": "(optional) Int?", - - "##_COMMENT8": "Disk space", - "##_Mutect2.MergeVCFs.disk_space_gb": "(optional) Int?", - "##_Mutect2.Filter.disk_space_gb": "(optional) Int?", - "##_Mutect2.M2.disk_space_gb": "(optional) Int?", - "##_Mutect2.M2.disk_space_gb": 100, - "##_Mutect2.oncotate_m2.disk_space_gb": "(optional) Int?", - "##_Mutect2.SplitIntervals.disk_space_gb": "(optional) Int?", - "##_Mutect2.MergeBamOuts.disk_space_gb": 
"(optional) Int?", - "##_Mutect2.CollectSequencingArtifactMetrics.disk_space_gb": "(optional) Int?", - "##_Mutect2.emergency_extra_disk": "(optional) Int?", - - "##_COMMENT9": "Preemptibles", - "##_Mutect2.MergeBamOuts.preemptible_attempts": "(optional) Int?", - "Mutect2.preemptible_attempts": 3 -} diff --git a/mutect2.inputs.json b/mutect2.inputs.json new file mode 100644 index 0000000..cbf3161 --- /dev/null +++ b/mutect2.inputs.json @@ -0,0 +1,28 @@ +{ + "Mutect2.gatk_docker": "broadinstitute/gatk:4.1.4.0", + + "Mutect2.intervals": "gs://gatk-best-practices/somatic-b37/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list", + "Mutect2.scatter_count": 50, + "Mutect2.m2_extra_args": "--downsampling-stride 20 --max-reads-per-alignment-start 6 --max-suspicious-reads-per-alignment-start 6", + "Mutect2.filter_funcotations": "True", + "Mutect2.funco_reference_version": "hg19", + "Mutect2.funco_data_sources_tar_gz": "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz", + "Mutect2.funco_transcript_selection_list": "gs://broad-public-datasets/funcotator/transcriptList.exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt", + + "Mutect2.ref_fasta": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.fasta", + "Mutect2.ref_dict": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.dict", + "Mutect2.ref_fai": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.fasta.fai", + "Mutect2.normal_reads": "gs://gatk-best-practices/somatic-b37/HCC1143_normal.bam", + "Mutect2.normal_reads_index": "gs://gatk-best-practices/somatic-b37/HCC1143_normal.bai", + "Mutect2.tumor_reads": "gs://gatk-best-practices/somatic-b37/HCC1143.bam", + "Mutect2.tumor_reads_index": "gs://gatk-best-practices/somatic-b37/HCC1143.bai", + + "Mutect2.pon": "gs://gatk-best-practices/somatic-b37/Mutect2-exome-panel.vcf", + "Mutect2.pon_idx": "gs://gatk-best-practices/somatic-b37/Mutect2-exome-panel.vcf.idx", + "Mutect2.gnomad": 
"gs://gatk-best-practices/somatic-b37/af-only-gnomad.raw.sites.vcf", + "Mutect2.gnomad_idx": "gs://gatk-best-practices/somatic-b37/af-only-gnomad.raw.sites.vcf.idx", + "Mutect2.variants_for_contamination": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf", + "Mutect2.variants_for_contamination_idx": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf.idx", + "Mutect2.realignment_index_bundle": "gs://gatk-test-data/mutect2/Homo_sapiens_assembly38.index_bundle" + +} diff --git a/mutect2.wdl b/mutect2.wdl index 99118b6..4f8d0c7 100644 --- a/mutect2.wdl +++ b/mutect2.wdl @@ -1,3 +1,5 @@ +version 1.0 + ## Copyright Broad Institute, 2017 ## ## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample, @@ -9,8 +11,8 @@ ## Description of inputs: ## ## ** Runtime ** -## gatk_docker, oncotator_docker: docker images to use for GATK 4 Mutect2 and for Oncotator -## preemptible_attempts: how many preemptions to tolerate before switching to a non-preemptible machine (on Google) +## gatk_docker: docker image to use for GATK 4 Mutect2 +## preemptible: how many preemptions to tolerate before switching to a non-preemptible machine (on Google) ## max_retries: how many times to retry failed tasks -- very important on the cloud when there are transient errors ## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar ## in the docker image. This must be supplied when running in an environment that does not support docker @@ -22,10 +24,6 @@ ## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional) ## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional) ## run_orientation_bias_mixture_model_filter: (optional) if true, filter orientation bias sites with the read orientation artifact mixture model. 
-## run_oncotator: if true, annotate the M2 VCFs using oncotator (to produce a TCGA MAF). Important: This requires a -## docker image and should not be run in environments where docker is unavailable (e.g. SGE cluster on -## a Broad on-prem VM). Access to docker hub is also required, since the task downloads a public docker image. -## (optional, false by default) ## ## ** Primary inputs ** ## ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary @@ -38,9 +36,6 @@ ## variants_for_contamination, variants_for_contamination_idx: VCF of common variants (and its index)with allele frequencies for calculating contamination ## ## ** Secondary resources ** (for optional tasks) -## onco_ds_tar_gz, default_config_file: Oncotator datasources and config file -## sequencing_center, sequence_source: metadata for Oncotator -## filter_oncotator_maf: Whether the MAF generated by oncotator should have the filtered variants removed. Default: true ## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator. ## ## Funcotator parameters (see Funcotator help for more details). @@ -71,83 +66,102 @@ ## authorized to run all programs before running this script. Please see the docker ## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information ## pertaining to the included programs. + +struct Runtime { + String gatk_docker + File? gatk_override + Int max_retries + Int preemptible + Int cpu + Int machine_mem + Int command_mem + Int disk + Int boot_disk_size +} + workflow Mutect2 { - # Mutect2 inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_reads - File tumor_reads_index - File? normal_reads - File? normal_reads_index - File? pon - File? pon_idx - Int scatter_count - File? gnomad - File? gnomad_idx - File? variants_for_contamination - File? variants_for_contamination_idx - File? realignment_index_bundle - String? 
realignment_extra_args - Boolean? run_orientation_bias_mixture_model_filter + input { + # Mutect2 inputs + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File tumor_reads + File tumor_reads_index + File? normal_reads + File? normal_reads_index + File? pon + File? pon_idx + Int scatter_count + File? gnomad + File? gnomad_idx + File? variants_for_contamination + File? variants_for_contamination_idx + File? realignment_index_bundle + String? realignment_extra_args + Boolean? run_orientation_bias_mixture_model_filter + String? m2_extra_args + String? m2_extra_filtering_args + String? split_intervals_extra_args + Boolean? make_bamout + Boolean? compress_vcfs + File? gga_vcf + File? gga_vcf_idx + + # Funcotator inputs + Boolean? run_funcotator + String? sequencing_center + String? sequence_source + String? funco_reference_version + String? funco_output_format + Boolean? funco_compress + Boolean? funco_use_gnomad_AF + File? funco_data_sources_tar_gz + String? funco_transcript_selection_mode + File? funco_transcript_selection_list + Array[String]? funco_annotation_defaults + Array[String]? funco_annotation_overrides + Array[String]? funcotator_excluded_fields + Boolean? funco_filter_funcotations + String? funcotator_extra_args + + String funco_default_output_format = "MAF" + + # runtime + String gatk_docker + File? gatk_override + String basic_bash_docker = "ubuntu:16.04" + Boolean? filter_funcotations + + Int? preemptible + Int? max_retries + Int small_task_cpu = 2 + Int small_task_mem = 4 + Int small_task_disk = 100 + Int boot_disk_size = 12 + Int learn_read_orientation_mem = 8000 + Int filter_alignment_artifacts_mem = 9000 + + # Use as a last resort to increase the disk given to every task in case of ill behaving data + Int? 
emergency_extra_disk + + # These are multipliers to multiply inputs by to make sure we have enough disk to accommodate for possible output sizes + # Large is for Bams/WGS vcfs + # Small is for metrics/other vcfs + Float large_input_to_output_multiplier = 2.25 + Float small_input_to_output_multiplier = 2.0 + Float cram_to_bam_multiplier = 6.0 + } + + Int preemptible_or_default = select_first([preemptible, 2]) + Int max_retries_or_default = select_first([max_retries, 2]) + + Boolean compress = select_first([compress_vcfs, false]) Boolean run_ob_filter = select_first([run_orientation_bias_mixture_model_filter, false]) - String? m2_extra_args - String? m2_extra_filtering_args - String? split_intervals_extra_args - Boolean? make_bamout Boolean make_bamout_or_default = select_first([make_bamout, false]) - Boolean? compress_vcfs - Boolean compress = select_first([compress_vcfs, false]) - File? gga_vcf - File? gga_vcf_idx - - # oncotator inputs - Boolean? run_oncotator - Boolean run_oncotator_or_default = select_first([run_oncotator, false]) - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? sequencing_center - String? sequence_source - File? default_config_file - String? oncotator_extra_args - - # Funcotator inputs - Boolean? run_funcotator Boolean run_funcotator_or_default = select_first([run_funcotator, false]) - String? funco_reference_version - String? funco_output_format - Boolean? funco_compress - Boolean? funco_use_gnomad_AF - File? funco_data_sources_tar_gz - String? funco_transcript_selection_mode - File? funco_transcript_selection_list - Array[String]? funco_annotation_defaults - Array[String]? funco_annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? funco_filter_funcotations - String? funcotator_extra_args - - String funco_default_output_format = "MAF" - - - # runtime - String gatk_docker - File? gatk_override - String basic_bash_docker = "ubuntu:16.04" - String?
oncotator_docker - String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.9.0"]) - Boolean? filter_oncotator_maf - Boolean filter_oncotator_maf_or_default = select_first([filter_oncotator_maf, true]) - Boolean? filter_funcotations Boolean filter_funcotations_or_default = select_first([filter_funcotations, true]) - Int? preemptible_attempts - Int? max_retries - - # Use as a last resort to increase the disk given to every task in case of ill behaving data - Int? emergency_extra_disk - # Disk sizes used for dynamic sizing Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB")) Int tumor_reads_size = ceil(size(tumor_reads, "GB") + size(tumor_reads_index, "GB")) @@ -155,20 +169,12 @@ workflow Mutect2 { Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 # If no tar is provided, the task downloads one from broads ftp server - Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100 Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100 Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 # This is added to every task as padding, should increase if systematically you need more disk for every call Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0]) - # These are multipliers to multipler inputs by to make sure we have enough disk to accommodate for possible output sizes - # Large is for Bams/WGS vcfs - # Small is for metrics/other vcfs - Float large_input_to_output_multiplier = 2.25 - Float small_input_to_output_multiplier = 2.0 - Float cram_to_bam_multiplier = 6.0 - # logic about output file names -- these are the names *without* .vcf extensions String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or 
.cram String unfiltered_name = output_basename + "-unfiltered" @@ -177,10 +183,14 @@ workflow Mutect2 { String output_vcf_name = output_basename + ".vcf" - # Size M2 differently based on if we are using NIO or not Int tumor_cram_to_bam_disk = ceil(tumor_reads_size * cram_to_bam_multiplier) Int normal_cram_to_bam_disk = ceil(normal_reads_size * cram_to_bam_multiplier) + Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, + "max_retries": max_retries_or_default, "preemptible": preemptible_or_default, "cpu": small_task_cpu, + "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, + "disk": small_task_disk + disk_pad, "boot_disk_size": boot_disk_size} + if (basename(tumor_reads) != basename(tumor_reads, ".cram")) { call CramToBam as TumorCramToBam { input: @@ -218,6 +228,7 @@ workflow Mutect2 { Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0 Int m2_output_size = tumor_bam_size / scatter_count + #TODO: do we need to change this disk size now that NIO is always going to happen (for the google backend only) Int m2_per_scatter_size = (tumor_bam_size + normal_bam_size) + ref_size + gnomad_vcf_size + m2_output_size + disk_pad call SplitIntervals { @@ -228,11 +239,7 @@ workflow Mutect2 { ref_dict = ref_dict, scatter_count = scatter_count, split_intervals_extra_args = split_intervals_extra_args, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ref_size + ceil(size(intervals, "GB") * small_input_to_output_multiplier) + disk_pad + runtime_params = standard_runtime } scatter (subintervals in SplitIntervals.interval_files ) { @@ -250,7 +257,7 @@ workflow Mutect2 { pon_idx = pon_idx, gnomad = gnomad, gnomad_idx = gnomad_idx, - preemptible_attempts = preemptible_attempts, + preemptible = preemptible, max_retries = max_retries, m2_extra_args = m2_extra_args, 
variants_for_contamination = variants_for_contamination, @@ -264,26 +271,19 @@ workflow Mutect2 { gatk_docker = gatk_docker, disk_space = m2_per_scatter_size } - - Float sub_vcf_size = size(M2.unfiltered_vcf, "GB") - Float sub_bamout_size = size(M2.output_bamOut, "GB") } - call SumFloats as SumSubVcfs { - input: - sizes = sub_vcf_size, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries - } + Int merged_vcf_size = ceil(size(M2.unfiltered_vcf, "GB")) + Int merged_bamout_size = ceil(size(M2.output_bamOut, "GB")) + Int merged_tumor_pileups_size = ceil(size(M2.tumor_pileups, "GB")) + Int merged_normal_pileups_size = ceil(size(M2.normal_pileups, "GB")) if (run_ob_filter) { call LearnReadOrientationModel { input: f1r2_tar_gz = M2.f1r2_counts, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries + runtime_params = standard_runtime, + mem = learn_read_orientation_mem } } @@ -293,21 +293,10 @@ workflow Mutect2 { input_vcf_indices = M2.unfiltered_vcf_idx, output_name = unfiltered_name, compress = compress, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad + runtime_params = standard_runtime } if (make_bamout_or_default) { - call SumFloats as SumSubBamouts { - input: - sizes = sub_bamout_size, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries - } - call MergeBamOuts { input: ref_fasta = ref_fasta, @@ -315,19 +304,12 @@ workflow Mutect2 { ref_dict = ref_dict, bam_outs = M2.output_bamOut, output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), - gatk_override = gatk_override, - gatk_docker = gatk_docker, - disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad, - max_retries = max_retries + runtime_params = standard_runtime, + disk_space =
ceil(merged_bamout_size * large_input_to_output_multiplier) + disk_pad, } } - call MergeStats { - input: - stats = M2.stats, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } + call MergeStats { input: stats = M2.stats, runtime_params = standard_runtime } if (defined(variants_for_contamination)) { call MergePileupSummaries as MergeTumorPileups { @@ -335,11 +317,7 @@ workflow Mutect2 { input_tables = M2.tumor_pileups, output_name = output_basename, ref_dict = ref_dict, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad + runtime_params = standard_runtime } if (defined(normal_bam)){ @@ -348,23 +326,15 @@ workflow Mutect2 { input_tables = M2.normal_pileups, output_name = output_basename, ref_dict = ref_dict, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad + runtime_params = standard_runtime } } call CalculateContamination { input: - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - gatk_docker = gatk_docker, tumor_pileups = MergeTumorPileups.merged_table, normal_pileups = MergeNormalPileups.merged_table, - disk_space = tumor_bam_size + normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad + runtime_params = standard_runtime } } @@ -373,58 +343,33 @@ workflow Mutect2 { ref_fasta = ref_fasta, ref_fai = ref_fai, ref_dict = ref_dict, - gatk_override = gatk_override, - gatk_docker = gatk_docker, intervals = intervals, unfiltered_vcf = MergeVCFs.merged_vcf, unfiltered_vcf_idx = MergeVCFs.merged_vcf_idx, output_name = filtered_name, compress = compress, - preemptible_attempts = preemptible_attempts, 
mutect_stats = MergeStats.merged_stats, - max_retries = max_retries, contamination_table = CalculateContamination.contamination_table, maf_segments = CalculateContamination.maf_segments, artifact_priors_tar_gz = LearnReadOrientationModel.artifact_prior_table, m2_extra_filtering_args = m2_extra_filtering_args, + runtime_params = standard_runtime, disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad } if (defined(realignment_index_bundle)) { call FilterAlignmentArtifacts { input: - gatk_override = gatk_override, bam = tumor_bam, bai = tumor_bai, realignment_index_bundle = select_first([realignment_index_bundle]), realignment_extra_args = realignment_extra_args, - gatk_docker = gatk_docker, - max_retries = max_retries, compress = compress, output_name = filtered_name, input_vcf = Filter.filtered_vcf, - input_vcf_idx = Filter.filtered_vcf_idx - } - } - - if (run_oncotator_or_default) { - File oncotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) - call oncotate_m2 { - input: - m2_vcf = oncotate_vcf_input, - onco_ds_tar_gz = onco_ds_tar_gz, - onco_ds_local_db_dir = onco_ds_local_db_dir, - sequencing_center = sequencing_center, - sequence_source = sequence_source, - default_config_file = default_config_file, - case_id = M2.tumor_sample[0], - control_id = M2.normal_sample[0], - oncotator_docker = oncotator_docker_or_default, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(size(oncotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad, - filter_maf = filter_oncotator_maf_or_default, - oncotator_extra_args = oncotator_extra_args + input_vcf_idx = Filter.filtered_vcf_idx, + runtime_params = standard_runtime, + mem = filter_alignment_artifacts_mem } } @@ -441,8 +386,8 @@ workflow Mutect2 { reference_version = select_first([funco_reference_version, "hg19"]), output_file_base_name = basename(funcotate_vcf_input, 
".vcf") + ".annotated", output_format = if defined(funco_output_format) then "" + funco_output_format else funco_default_output_format, - compress = if defined(funco_compress) then funco_compress else false, - use_gnomad = if defined(funco_use_gnomad_AF) then funco_use_gnomad_AF else false, + compress = if defined(funco_compress) then select_first([funco_compress]) else false, + use_gnomad = if defined(funco_use_gnomad_AF) then select_first([funco_use_gnomad_AF]) else false, data_sources_tar_gz = funco_data_sources_tar_gz, case_id = M2.tumor_sample[0], control_id = M2.normal_sample[0], @@ -455,11 +400,8 @@ workflow Mutect2 { funcotator_excluded_fields = funcotator_excluded_fields, filter_funcotations = filter_funcotations_or_default, extra_args = funcotator_extra_args, - gatk_docker = gatk_docker, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space_gb = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad + runtime_params = standard_runtime, + disk_space = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + funco_tar_size + disk_pad } } @@ -470,7 +412,6 @@ workflow Mutect2 { File mutect_stats = MergeStats.merged_stats File? contamination_table = CalculateContamination.contamination_table - File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf File? funcotated_file = Funcotate.funcotated_output_file File? funcotated_file_index = Funcotate.funcotated_output_file_index File? bamout = MergeBamOuts.merged_bam_out @@ -481,15 +422,17 @@ workflow Mutect2 { } task CramToBam { - - File ref_fasta - File ref_fai - File ref_dict - File cram - File crai - String name - Int disk_size - Int? mem + input { + File ref_fasta + File ref_fai + File ref_dict + #cram and crai must be optional since Normal cram is optional + File? cram + File? crai + String name + Int disk_size + Int? 
mem + } Int machine_mem = if defined(mem) then mem * 1000 else 6000 @@ -499,10 +442,10 @@ task CramToBam { set -e set -o pipefail - samtools view -h -T ${ref_fasta} ${cram} | - samtools view -b -o ${name}.bam - - samtools index -b ${name}.bam - mv ${name}.bam.bai ${name}.bai + samtools view -h -T ~{ref_fasta} ~{cram} | + samtools view -b -o ~{name}.bam - + samtools index -b ~{name}.bam + mv ~{name}.bam.bai ~{name}.bai } runtime { @@ -512,57 +455,46 @@ task CramToBam { } output { - File output_bam = "${name}.bam" - File output_bai = "${name}.bai" + File output_bam = "~{name}.bam" + File output_bai = "~{name}.bai" } } task SplitIntervals { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - String? split_intervals_extra_args - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + Int scatter_count + String? split_intervals_extra_args + + # runtime + Runtime runtime_params + } command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} mkdir interval-files - gatk --java-options "-Xmx${command_mem}m" SplitIntervals \ - -R ${ref_fasta} \ - ${"-L " + intervals} \ - -scatter ${scatter_count} \ + gatk --java-options "-Xmx~{runtime_params.command_mem}m" SplitIntervals \ + -R ~{ref_fasta} \ + ~{"-L " + intervals} \ + -scatter ~{scatter_count} \ -O interval-files \ - ${split_intervals_extra_args} + ~{split_intervals_extra_args} cp interval-files/*.interval_list . 
} runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { @@ -571,92 +503,111 @@ task SplitIntervals { } task M2 { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_bam - File tumor_bai - File? normal_bam - File? normal_bai - File? pon - File? pon_idx - File? gnomad - File? gnomad_idx - String? m2_extra_args - Boolean? make_bamout - Boolean? run_ob_filter - Boolean compress - File? gga_vcf - File? gga_vcf_idx - File? variants_for_contamination - File? variants_for_contamination_idx + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File tumor_bam + File tumor_bai + File? normal_bam + File? normal_bai + File? pon + File? pon_idx + File? gnomad + File? gnomad_idx + String? m2_extra_args + Boolean? make_bamout + Boolean? run_ob_filter + Boolean compress + File? gga_vcf + File? gga_vcf_idx + File? variants_for_contamination + File? variants_for_contamination_idx + + File? gatk_override + + # runtime + String gatk_docker + Int? mem + Int? preemptible + Int? max_retries + Int? disk_space + Int? cpu + Boolean use_ssd = false + } String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf" String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" String output_stats = output_vcf + ".stats" - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 3500 Int command_mem = machine_mem - 500 + parameter_meta{ + intervals: {localization_optional: true} + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + tumor_bam: {localization_optional: true} + tumor_bai: {localization_optional: true} + normal_bam: {localization_optional: true} + normal_bai: {localization_optional: true} + pon: {localization_optional: true} + pon_idx: {localization_optional: true} + gnomad: {localization_optional: true} + gnomad_idx: {localization_optional: true} + gga_vcf: {localization_optional: true} + gga_vcf_idx: {localization_optional: true} + variants_for_contamination: {localization_optional: true} + variants_for_contamination_idx: {localization_optional: true} + } command <<< set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} # We need to create these files regardless, even if they stay empty touch bamout.bam touch f1r2.tar.gz echo "" > normal_name.txt - gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode - tumor_command_line="-I ${tumor_bam} -tumor `cat tumor_name.txt`" + gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{tumor_bam} -O tumor_name.txt -encode + tumor_command_line="-I ~{tumor_bam} -tumor `cat tumor_name.txt`" - if [[ ! -z "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${normal_bam} -O normal_name.txt -encode - normal_command_line="-I ${normal_bam} -normal `cat normal_name.txt`" + if [[ ! 
-z "~{normal_bam}" ]]; then + gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_bam} -O normal_name.txt -encode + normal_command_line="-I ~{normal_bam} -normal `cat normal_name.txt`" fi - gatk --java-options "-Xmx${command_mem}m" Mutect2 \ - -R ${ref_fasta} \ + gatk --java-options "-Xmx~{command_mem}m" Mutect2 \ + -R ~{ref_fasta} \ $tumor_command_line \ $normal_command_line \ - ${"--germline-resource " + gnomad} \ - ${"-pon " + pon} \ - ${"-L " + intervals} \ - ${"--alleles " + gga_vcf} \ - -O "${output_vcf}" \ - ${true='--bam-output bamout.bam' false='' make_bamout} \ - ${true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \ - ${m2_extra_args} + ~{"--germline-resource " + gnomad} \ + ~{"-pon " + pon} \ + ~{"-L " + intervals} \ + ~{"--alleles " + gga_vcf} \ + -O "~{output_vcf}" \ + ~{true='--bam-output bamout.bam' false='' make_bamout} \ + ~{true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \ + ~{m2_extra_args} ### GetPileupSummaries # These must be created, even if they remain empty, as cromwell doesn't support optional output touch tumor-pileups.table touch normal-pileups.table - if [[ ! -z "${variants_for_contamination}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ - -V ${variants_for_contamination} -L ${variants_for_contamination} -O tumor-pileups.table + if [[ ! -z "~{variants_for_contamination}" ]]; then + gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries -R ~{ref_fasta} -I ~{tumor_bam} ~{"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ~{variants_for_contamination} -L ~{variants_for_contamination} -O tumor-pileups.table - if [[ ! 
-z "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ - -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal-pileups.table + if [[ ! -z "~{normal_bam}" ]]; then + gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries -R ~{ref_fasta} -I ~{normal_bam} ~{"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ~{variants_for_contamination} -L ~{variants_for_contamination} -O normal-pileups.table fi fi >>> @@ -666,18 +617,18 @@ task M2 { bootDiskSizeGb: 12 memory: machine_mem + " MB" disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) + preemptible: select_first([preemptible, 10]) maxRetries: select_first([max_retries, 0]) cpu: select_first([cpu, 1]) } output { - File unfiltered_vcf = "${output_vcf}" - File unfiltered_vcf_idx = "${output_vcf_idx}" + File unfiltered_vcf = "~{output_vcf}" + File unfiltered_vcf_idx = "~{output_vcf_idx}" File output_bamOut = "bamout.bam" String tumor_sample = read_string("tumor_name.txt") String normal_sample = read_string("normal_name.txt") - File stats = "${output_stats}" + File stats = "~{output_stats}" File f1r2_counts = "f1r2.tar.gz" File tumor_pileups = "tumor-pileups.table" File normal_pileups = "normal-pileups.table" @@ -685,147 +636,109 @@ task M2 { } task MergeVCFs { - # inputs - Array[File] input_vcfs - Array[File] input_vcf_indices - String output_name - Boolean compress + input { + Array[File] input_vcfs + Array[File] input_vcf_indices + String output_name + Boolean compress + Runtime runtime_params + } + String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? 
disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 1000 - # using MergeVcfs instead of GatherVcfs so we can create indices # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs. command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" MergeVcfs -I ${sep=' -I ' input_vcfs} -O ${output_vcf} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} + gatk --java-options "-Xmx~{runtime_params.command_mem}m" MergeVcfs -I ~{sep=' -I ' input_vcfs} -O ~{output_vcf} } runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File merged_vcf = "${output_vcf}" - File merged_vcf_idx = "${output_vcf_idx}" + File merged_vcf = "~{output_vcf}" + File merged_vcf_idx = "~{output_vcf_idx}" } } task MergeBamOuts { - # inputs - File ref_fasta - File ref_fai - File ref_dict - Array[File]+ bam_outs - String output_vcf_name - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 1000 + input { + File ref_fasta + File ref_fai + File ref_dict + Array[File]+ bam_outs + String output_vcf_name + Runtime runtime_params + Int? disk_space #override to request more disk than default small task params + } command <<< # This command block assumes that there is at least one file in bam_outs. # Do not call this task if len(bam_outs) == 0 set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" GatherBamFiles \ - -I ${sep=" -I " bam_outs} -O unsorted.out.bam -R ${ref_fasta} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} + gatk --java-options "-Xmx~{runtime_params.command_mem}m" GatherBamFiles \ + -I ~{sep=" -I " bam_outs} -O unsorted.out.bam -R ~{ref_fasta} # We must sort because adjacent scatters may have overlapping (padded) assembly regions, hence # overlapping bamouts - gatk --java-options "-Xmx${command_mem}m" SortSam -I unsorted.out.bam \ - -O ${output_vcf_name}.out.bam \ + gatk --java-options "-Xmx~{runtime_params.command_mem}m" SortSam -I unsorted.out.bam \ + -O ~{output_vcf_name}.out.bam \ --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT - gatk --java-options "-Xmx${command_mem}m" BuildBamIndex -I ${output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT + gatk --java-options "-Xmx~{runtime_params.command_mem}m" BuildBamIndex -I ~{output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT >>> runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + 
bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + select_first([disk_space, runtime_params.disk]) + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File merged_bam_out = "${output_vcf_name}.out.bam" - File merged_bam_out_index = "${output_vcf_name}.out.bai" + File merged_bam_out = "~{output_vcf_name}.out.bam" + File merged_bam_out_index = "~{output_vcf_name}.out.bai" } } task MergeStats { - # inputs - Array[File]+ stats - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 2000 - Int command_mem = machine_mem - 1000 + input { + Array[File]+ stats + Runtime runtime_params + } command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - gatk --java-options "-Xmx${command_mem}m" MergeMutectStats \ - -stats ${sep=" -stats " stats} -O merged.stats + gatk --java-options "-Xmx~{runtime_params.command_mem}m" MergeMutectStats \ + -stats ~{sep=" -stats " stats} -O merged.stats } runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 10]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output 
{ @@ -834,85 +747,66 @@ task MergeStats { } task MergePileupSummaries { - # input_tables needs to be optional because GetPileupSummaries is in an if-block - Array[File?] input_tables - String output_name - File? gatk_override - File ref_dict - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 1000 + input { + Array[File] input_tables + String output_name + File ref_dict + Runtime runtime_params + } command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - gatk --java-options "-Xmx${command_mem}m" GatherPileupSummaries \ - --sequence-dictionary ${ref_dict} \ - -I ${sep=' -I ' input_tables} \ - -O ${output_name}.tsv + gatk --java-options "-Xmx~{runtime_params.command_mem}m" GatherPileupSummaries \ + --sequence-dictionary ~{ref_dict} \ + -I ~{sep=' -I ' input_tables} \ + -O ~{output_name}.tsv } runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 3]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File merged_table = "${output_name}.tsv" + File merged_table = "~{output_name}.tsv" } } # Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of 
September 2018 task LearnReadOrientationModel { - Array[File] f1r2_tar_gz - File? gatk_override - - # runtime - Int? max_retries - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? cpu - Boolean use_ssd = false + input { + Array[File] f1r2_tar_gz + Runtime runtime_params + Int? mem #override memory + } - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 8000 + Int machine_mem = select_first([mem, runtime_params.machine_mem]) Int command_mem = machine_mem - 1000 command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - gatk --java-options "-Xmx${command_mem}m" LearnReadOrientationModel \ - -I ${sep=" -I " f1r2_tar_gz} \ + gatk --java-options "-Xmx~{command_mem}m" LearnReadOrientationModel \ + -I ~{sep=" -I " f1r2_tar_gz} \ -O "artifact-priors.tar.gz" } runtime { - docker: gatk_docker - bootDiskSizeGb: 12 + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 3]) - cpu: select_first([cpu, 1]) + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { @@ -922,40 +816,30 @@ task LearnReadOrientationModel { } task CalculateContamination { - # inputs - String? intervals - File tumor_pileups - File? normal_pileups - - File? gatk_override - - # runtime - Int? preemptible_attempts - Int? max_retries - String gatk_docker - Int? disk_space - Int? 
mem - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3000 - Int command_mem = machine_mem - 500 + input { + String? intervals + File tumor_pileups + File? normal_pileups + Runtime runtime_params + } command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I ${tumor_pileups} \ - -O contamination.table --tumor-segmentation segments.table ${"-matched " + normal_pileups} + gatk --java-options "-Xmx~{runtime_params.command_mem}m" CalculateContamination -I ~{tumor_pileups} \ + -O contamination.table --tumor-segmentation segments.table ~{"-matched " + normal_pileups} } runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: command_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { @@ -965,257 +849,161 @@ task CalculateContamination { } task Filter { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File unfiltered_vcf - File unfiltered_vcf_idx - String output_name - Boolean compress + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File unfiltered_vcf + File unfiltered_vcf_idx + String output_name + Boolean compress + File? mutect_stats + File? artifact_priors_tar_gz + File? contamination_table + File? maf_segments + String? m2_extra_filtering_args + + Runtime runtime_params + Int? 
disk_space + } + String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - File? mutect_stats - File? artifact_priors_tar_gz - File? contamination_table - File? maf_segments - String? m2_extra_filtering_args - - File? gatk_override - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 500 + parameter_meta{ + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + } command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \ - -R ${ref_fasta} \ - -O ${output_vcf} \ - ${"--contamination-table " + contamination_table} \ - ${"--tumor-segmentation " + maf_segments} \ - ${"--ob-priors " + artifact_priors_tar_gz} \ - ${"-stats " + mutect_stats} \ - --filtering-stats filtering.stats \ - ${m2_extra_filtering_args} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} + + gatk --java-options "-Xmx~{runtime_params.command_mem}m" FilterMutectCalls -V ~{unfiltered_vcf} \ + -R ~{ref_fasta} \ + -O ~{output_vcf} \ + ~{"--contamination-table " + contamination_table} \ + ~{"--tumor-segmentation " + maf_segments} \ + ~{"--ob-priors " + artifact_priors_tar_gz} \ + ~{"-stats " + mutect_stats} \ + --filtering-stats filtering.stats \ + ~{m2_extra_filtering_args} } runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 
0]) - cpu: select_first([cpu, 1]) + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + select_first([disk_space, runtime_params.disk]) + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File filtered_vcf = "${output_vcf}" - File filtered_vcf_idx = "${output_vcf_idx}" + File filtered_vcf = "~{output_vcf}" + File filtered_vcf_idx = "~{output_vcf_idx}" File filtering_stats = "filtering.stats" } } task FilterAlignmentArtifacts { - #input - File? gatk_override - File input_vcf - File input_vcf_idx - File bam - File bai - String output_name - Boolean compress + input { + File input_vcf + File input_vcf_idx + File bam + File bai + String output_name + Boolean compress + File realignment_index_bundle + String? realignment_extra_args + Runtime runtime_params + Int mem + } + String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - File realignment_index_bundle - String? realignment_extra_args - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 9000 + Int machine_mem = mem Int command_mem = machine_mem - 500 - command { - set -e - - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" FilterAlignmentArtifacts \ - -V ${input_vcf} \ - -I ${bam} \ - --bwa-mem-index-image ${realignment_index_bundle} \ - ${realignment_extra_args} \ - -O ${output_vcf} - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: command_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File filtered_vcf = "${output_vcf}" - File filtered_vcf_idx = "${output_vcf_idx}" + parameter_meta{ + input_vcf: {localization_optional: true} + input_vcf_idx: {localization_optional: true} + bam: {localization_optional: true} + bai: {localization_optional: true} } -} - -task oncotate_m2 { - # inputs - File m2_vcf - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? oncotator_exe - String? sequencing_center - String? sequence_source - File? default_config_file - String case_id - String? control_id - String? oncotator_extra_args - - # runtime - String oncotator_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - Boolean? 
filter_maf - Boolean is_filter_maf = select_first([filter_maf, true]) - String filter_maf_args = if (is_filter_maf) then " --collapse-filter-cols --prune-filter-cols " else "" - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - command <<< - # fail if *any* command below (not just the last) doesn't return 0, in particular if wget fails + command { set -e - # local db dir is a directory and has been specified - if [[ -d "${onco_ds_local_db_dir}" ]]; then - echo "Using local db-dir: ${onco_ds_local_db_dir}" - echo "THIS ONLY WORKS WITHOUT DOCKER!" - ln -s ${onco_ds_local_db_dir} onco_dbdir - elif [[ "${onco_ds_tar_gz}" == *.tar.gz ]]; then - echo "Using given tar file: ${onco_ds_tar_gz}" - mkdir onco_dbdir - tar zxvf ${onco_ds_tar_gz} -C onco_dbdir --strip-components 1 - else - echo "Downloading and installing oncotator datasources from Broad FTP site..." - # Download and untar the db-dir - wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/oncotator/oncotator_v1_ds_April052016.tar.gz - tar zxvf oncotator_v1_ds_April052016.tar.gz - ln -s oncotator_v1_ds_April052016 onco_dbdir - fi + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - ${default="/root/oncotator_venv/bin/oncotator" oncotator_exe} --db-dir onco_dbdir/ -c $HOME/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \ - -v ${m2_vcf} ${case_id}.maf.annotated hg19 -i VCF -o TCGAMAF --skip-no-alt --collapse-number-annotations --log_name oncotator.log \ - -a Center:${default="Unknown" sequencing_center} \ - -a source:${default="Unknown" sequence_source} \ - -a normal_barcode:${control_id} \ - -a tumor_barcode:${case_id} \ - ${"--default_config " + default_config_file} \ - ${filter_maf_args} \ - ${oncotator_extra_args} - >>> + gatk --java-options "-Xmx~{command_mem}m" FilterAlignmentArtifacts \ + -V ~{input_vcf} \ + -I ~{bam} \ + --bwa-mem-index-image 
~{realignment_index_bundle} \ + ~{realignment_extra_args} \ + -O ~{output_vcf} + } runtime { - docker: oncotator_docker + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size memory: machine_mem + " MB" - bootDiskSizeGb: 12 - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File oncotated_m2_maf="${case_id}.maf.annotated" + disks: "local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } -} - -# Calculates sum of a list of floats -task SumFloats { - Array[Float] sizes - - # Runtime parameters - Int? preemptible_attempts - Int? max_retries - - command <<< - python -c "print ${sep="+" sizes}" - >>> output { - Float total_size = read_float(stdout()) - } - - runtime { - docker: "python:2.7" - disks: "local-disk " + 10 + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) + File filtered_vcf = "~{output_vcf}" + File filtered_vcf_idx = "~{output_vcf_idx}" } } task Funcotate { - # ============== - # Inputs - File ref_fasta - File ref_fai - File ref_dict - File input_vcf - File input_vcf_idx - String reference_version - String output_file_base_name - String output_format - Boolean compress - Boolean use_gnomad - # This should be updated when a new version of the data sources is released - # TODO: Make this dynamically chosen in the command. - File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz" - String? control_id - String? case_id - String? sequencing_center - String? sequence_source - String? transcript_selection_mode - File? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides - Array[String]? 
funcotator_excluded_fields - Boolean? filter_funcotations - File? interval_list - - String? extra_args + input { + File ref_fasta + File ref_fai + File ref_dict + File input_vcf + File input_vcf_idx + String reference_version + String output_file_base_name + String output_format + Boolean compress + Boolean use_gnomad + # This should be updated when a new version of the data sources is released + # TODO: Make this dynamically chosen in the command. + File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz" + String? control_id + String? case_id + String? sequencing_center + String? sequence_source + String? transcript_selection_mode + File? transcript_selection_list + Array[String]? annotation_defaults + Array[String]? annotation_overrides + Array[String]? funcotator_excluded_fields + Boolean? filter_funcotations + File? interval_list + + String? extra_args + + # ============== + Runtime runtime_params + Int? disk_space #override to request more disk than default small task params + + # You may have to change the following two parameter values depending on the task requirements + Int default_ram_mb = 3000 + # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space). Please see [TODO: Link from Jose] for examples. + Int default_disk_space_gb = 100 + } # ============== # Process input args: @@ -1233,92 +1021,79 @@ task Funcotate { String interval_list_arg = if defined(interval_list) then " -L " else "" String extra_args_arg = select_first([extra_args, ""]) - # ============== - # Runtime options: - String gatk_docker - File? gatk_override - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space_gb - Int? 
cpu - - Boolean use_ssd = false - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 3000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. - Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - String dollar = "$" + parameter_meta{ + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + input_vcf: {localization_optional: true} + input_vcf_idx: {localization_optional: true} + } + command <<< set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} # Extract our data sources: echo "Extracting data sources zip file..." mkdir datasources_dir - tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1 + tar zxvf ~{data_sources_tar_gz} -C datasources_dir --strip-components 1 DATA_SOURCES_FOLDER="$PWD/datasources_dir" # Handle gnomAD: - if ${use_gnomad} ; then + if ~{use_gnomad} ; then echo "Enabling gnomAD..." 
for potential_gnomad_gz in gnomAD_exome.tar.gz gnomAD_genome.tar.gz ; do - if [[ -f ${dollar}{DATA_SOURCES_FOLDER}/${dollar}{potential_gnomad_gz} ]] ; then - cd ${dollar}{DATA_SOURCES_FOLDER} - tar -zvxf ${dollar}{potential_gnomad_gz} + if [[ -f ~{dollar}{DATA_SOURCES_FOLDER}/~{dollar}{potential_gnomad_gz} ]] ; then + cd ~{dollar}{DATA_SOURCES_FOLDER} + tar -zvxf ~{dollar}{potential_gnomad_gz} cd - else - echo "ERROR: Cannot find gnomAD folder: ${dollar}{potential_gnomad_gz}" 1>&2 + echo "ERROR: Cannot find gnomAD folder: ~{dollar}{potential_gnomad_gz}" 1>&2 false fi done fi # Run Funcotator: - gatk --java-options "-Xmx${command_mem}m" Funcotator \ + gatk --java-options "-Xmx~{runtime_params.command_mem}m" Funcotator \ --data-sources-path $DATA_SOURCES_FOLDER \ - --ref-version ${reference_version} \ - --output-file-format ${output_format} \ - -R ${ref_fasta} \ - -V ${input_vcf} \ - -O ${output_file} \ - ${interval_list_arg} ${default="" interval_list} \ - --annotation-default normal_barcode:${default="Unknown" control_id} \ - --annotation-default tumor_barcode:${default="Unknown" case_id} \ - --annotation-default Center:${default="Unknown" sequencing_center} \ - --annotation-default source:${default="Unknown" sequence_source} \ - ${"--transcript-selection-mode " + transcript_selection_mode} \ - ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \ - ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \ - ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \ - ${excluded_fields_args}${default="" sep=" --exclude-field " funcotator_excluded_fields} \ - ${filter_funcotations_args} \ - ${extra_args_arg} + --ref-version ~{reference_version} \ + --output-file-format ~{output_format} \ + -R ~{ref_fasta} \ + -V ~{input_vcf} \ + -O ~{output_file} \ + ~{interval_list_arg} ~{default="" interval_list} \ + --annotation-default normal_barcode:~{default="Unknown" control_id} 
\ + --annotation-default tumor_barcode:~{default="Unknown" case_id} \ + --annotation-default Center:~{default="Unknown" sequencing_center} \ + --annotation-default source:~{default="Unknown" sequence_source} \ + ~{"--transcript-selection-mode " + transcript_selection_mode} \ + ~{transcript_selection_arg}~{default="" sep=" --transcript-list " transcript_selection_list} \ + ~{annotation_def_arg}~{default="" sep=" --annotation-default " annotation_defaults} \ + ~{annotation_over_arg}~{default="" sep=" --annotation-override " annotation_overrides} \ + ~{excluded_fields_args}~{default="" sep=" --exclude-field " funcotator_excluded_fields} \ + ~{filter_funcotations_args} \ + ~{extra_args_arg} # Make sure we have a placeholder index for MAF files so this workflow doesn't fail: - if [[ "${output_format}" == "MAF" ]] ; then - touch ${output_maf_index} + if [[ "~{output_format}" == "MAF" ]] ; then + touch ~{output_maf_index} fi >>> - runtime { - docker: gatk_docker - bootDiskSizeGb: 20 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } + runtime { + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size + memory: runtime_params.machine_mem + " MB" + disks: "local-disk " + select_first([disk_space, runtime_params.disk]) + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu + } output { - File funcotated_output_file = "${output_file}" - File funcotated_output_file_index = "${output_file_index}" + File funcotated_output_file = "~{output_file}" + File funcotated_output_file_index = "~{output_file_index}" } - } +} diff --git a/mutect2_nio.wdl b/mutect2_nio.wdl deleted file mode 100644 index ff9f3c6..0000000 --- a/mutect2_nio.wdl +++ /dev/null @@ -1,1296 +0,0 @@ -## 
Copyright Broad Institute, 2017 -## -## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample, -## and performs additional filtering and functional annotation tasks. -## -## NOTE: this wdl is an exact copy of mutect2.wdl in the gatk repo except for replacing File with String in GATK task inputs in order to -## avoid localizing files in cromwell and thereby allowing the GATK engine to access cloud-based files with NIO. Once -## cromwell supports "smart" File variables that know when and when not to localize the two wdls should be merged. -## -## Main requirements/expectations : -## - One analysis-ready BAM file (and its index) for each sample -## -## Description of inputs: -## -## ** Runtime ** -## gatk_docker, oncotator_docker: docker images to use for GATK 4 Mutect2 and for Oncotator -## preemptible_attempts: how many preemptions to tolerate before switching to a non-preemptible machine (on Google) -## max_retries: how many times to retry failed tasks -- very important on the cloud when there are transient errors -## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar -## in the docker image. This must be supplied when running in an environment that does not support docker -## (e.g. SGE cluster on a Broad on-prem VM) -## -## ** Workflow options ** -## intervals: genomic intervals (will be used for scatter) -## scatter_count: number of parallel jobs to generate when scattering over intervals -## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional) -## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional) -## run_orientation_bias_mixture_model_filter: (optional) if true, filter orientation bias sites with the read orientation artifact mixture model. -## run_oncotator: if true, annotate the M2 VCFs using oncotator (to produce a TCGA MAF). 
Important: This requires a -## docker image and should not be run in environments where docker is unavailable (e.g. SGE cluster on -## a Broad on-prem VM). Access to docker hub is also required, since the task downloads a public docker image. -## (optional, false by default) -## -## ** Primary inputs ** -## ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary -## tumor_bam, tumor_bam_index: BAM and index for the tumor sample -## normal_bam, normal_bam_index: BAM and index for the normal sample -## -## ** Primary resources ** (optional but strongly recommended) -## pon: optional panel of normals in VCF format containing probable technical artifacts (false positves) -## gnomad: optional database of known germline variants (see http://gnomad.broadinstitute.org/downloads) -## variants_for_contamination: VCF of common variants with allele frequencies for calculating contamination -## -## ** Secondary resources ** (for optional tasks) -## onco_ds_tar_gz, default_config_file: Oncotator datasources and config file -## sequencing_center, sequence_source: metadata for Oncotator -## filter_oncotator_maf: Whether the MAF generated by oncotator should have the filtered variants removed. Default: true -## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator. -## -## Funcotator parameters (see Funcotator help for more details). -## funco_reference_version: "hg19" for hg19 or b37. "hg38" for hg38. Default: "hg19" -## funco_output_format: "MAF" to produce a MAF file, "VCF" to procude a VCF file. Default: "MAF" -## funco_compress: (Only valid if funco_output_format == "VCF" ) If true, will compress the output of Funcotator. If false, produces an uncompressed output file. Default: false -## funco_use_gnomad_AF: If true, will include gnomAD allele frequency annotations in output by connecting to the internet to query gnomAD (this impacts performance). 
If false, will not annotate with gnomAD. Default: false -## funco_transcript_selection_mode: How to select transcripts in Funcotator. ALL, CANONICAL, or BEST_EFFECT -## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process. -## funco_data_sources_tar_gz: Funcotator datasources tar gz file. Bucket location is recommended when running on the cloud. -## funco_annotation_defaults: Default values for annotations, when values are unspecified. Specified as :. For example: "Center:Broad" -## funco_annotation_overrides: Values for annotations, even when values are unspecified. Specified as :. For example: "Center:Broad" -## funcotator_excluded_fields: Annotations that should not appear in the output (VCF or MAF). Specified as . For example: "ClinVar_ALLELEID" -## funco_filter_funcotations: If true, will only annotate variants that have passed filtering (. or PASS value in the FILTER column). If false, will annotate all variants in the input file. Default: true -## funcotator_extra_args: Any additional arguments to pass to Funcotator. Default: "" -## -## Outputs : -## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam -## file of reassembled reads if requested -## -## Cromwell version support -## - Successfully tested on v34 -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information -## pertaining to the included programs. -workflow Mutect2 { - # Mutect2 inputs - File? 
intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_reads - File tumor_reads_index - File? normal_reads - File? normal_reads_index - File? pon - Int scatter_count - File? gnomad - File? variants_for_contamination - File? realignment_index_bundle - String? realignment_extra_args - Boolean? run_orientation_bias_mixture_model_filter - Boolean run_ob_filter = select_first([run_orientation_bias_mixture_model_filter, false]) - String? m2_extra_args - String? m2_extra_filtering_args - String? split_intervals_extra_args - Boolean? make_bamout - Boolean make_bamout_or_default = select_first([make_bamout, false]) - Boolean? compress_vcfs - Boolean compress = select_first([compress_vcfs, false]) - File? gga_vcf - - # oncotator inputs - Boolean? run_oncotator - Boolean run_oncotator_or_default = select_first([run_oncotator, false]) - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? sequencing_center - String? sequence_source - File? default_config_file - String? oncotator_extra_args - - # Funcotator inputs - Boolean? run_funcotator - Boolean run_funcotator_or_default = select_first([run_funcotator, false]) - String? funco_reference_version - String? funco_output_format - Boolean? funco_compress - Boolean? funco_use_gnomad_AF - File? funco_data_sources_tar_gz - String? funco_transcript_selection_mode - File? funco_transcript_selection_list - Array[String]? funco_annotation_defaults - Array[String]? funco_annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? funco_filter_funcotations - String? funcotator_extra_args - - String funco_default_output_format = "MAF" - - - # runtime - String gatk_docker - File? gatk_override - String basic_bash_docker = "ubuntu:16.04" - String? oncotator_docker - String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.9.0"]) - Boolean? filter_oncotator_maf - Boolean filter_oncotator_maf_or_default = select_first([filter_oncotator_maf, true]) - Boolean? 
filter_funcotations - Boolean filter_funcotations_or_default = select_first([filter_funcotations, true]) - - Int? preemptible_attempts - Int? max_retries - - # Use as a last resort to increase the disk given to every task in case of ill behaving data - Int? emergency_extra_disk - - # Disk sizes used for dynamic sizing - Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB")) - Int tumor_reads_size = ceil(size(tumor_reads, "GB") + size(tumor_reads_index, "GB")) - Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB")) else 0 - Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 - - # If no tar is provided, the task downloads one from broads ftp server - Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100 - Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100 - Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 - - # This is added to every task as padding, should increase if systematically you need more disk for every call - Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0]) - - # These are multipliers to multipler inputs by to make sure we have enough disk to accommodate for possible output sizes - # Large is for Bams/WGS vcfs - # Small is for metrics/other vcfs - Float large_input_to_output_multiplier = 2.25 - Float small_input_to_output_multiplier = 2.0 - Float cram_to_bam_multiplier = 6.0 - - # logic about output file names -- these are the names *without* .vcf extensions - String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or .cram - String unfiltered_name = output_basename + "-unfiltered" - String filtered_name = output_basename + "-filtered" - String funcotated_name = output_basename + "-funcotated" - - 
String output_vcf_name = output_basename + ".vcf" - - # Size M2 differently based on if we are using NIO or not - Int tumor_cram_to_bam_disk = ceil(tumor_reads_size * cram_to_bam_multiplier) - Int normal_cram_to_bam_disk = ceil(normal_reads_size * cram_to_bam_multiplier) - - if (basename(tumor_reads) != basename(tumor_reads, ".cram")) { - call CramToBam as TumorCramToBam { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - cram = tumor_reads, - crai = tumor_reads_index, - name = output_basename, - disk_size = tumor_cram_to_bam_disk - } - } - - String normal_or_empty = select_first([normal_reads, ""]) - if (basename(normal_or_empty) != basename(normal_or_empty, ".cram")) { - String normal_basename = basename(basename(normal_or_empty, ".bam"),".cram") - call CramToBam as NormalCramToBam { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - cram = normal_reads, - crai = normal_reads_index, - name = normal_basename, - disk_size = normal_cram_to_bam_disk - } - } - - File tumor_bam = select_first([TumorCramToBam.output_bam, tumor_reads]) - File tumor_bai = select_first([TumorCramToBam.output_bai, tumor_reads_index]) - File? normal_bam = if defined(normal_reads) then select_first([NormalCramToBam.output_bam, normal_reads]) else normal_reads - File? 
normal_bai = if defined(normal_reads) then select_first([NormalCramToBam.output_bai, normal_reads_index]) else normal_reads_index - - Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bai, "GB")) - Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0 - - Int m2_output_size = tumor_bam_size / scatter_count - Int m2_per_scatter_size = ((tumor_bam_size + normal_bam_size) / scatter_count) + ref_size + (gnomad_vcf_size / scatter_count) + m2_output_size + disk_pad - - call SplitIntervals { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - split_intervals_extra_args = split_intervals_extra_args, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ref_size + ceil(size(intervals, "GB") * small_input_to_output_multiplier) + disk_pad - } - - scatter (subintervals in SplitIntervals.interval_files ) { - call M2 { - input: - intervals = subintervals, - ref_fasta = ref_fasta, - tumor_bam = tumor_bam, - normal_bam = normal_bam, - pon = pon, - gnomad = gnomad, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - m2_extra_args = m2_extra_args, - variants_for_contamination = variants_for_contamination, - make_bamout = make_bamout_or_default, - run_ob_filter = run_ob_filter, - compress = compress, - gga_vcf = gga_vcf, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - disk_space = m2_per_scatter_size - } - - Float sub_vcf_size = size(M2.unfiltered_vcf, "GB") - Float sub_bamout_size = size(M2.output_bamOut, "GB") - } - - call SumFloats as SumSubVcfs { - input: - sizes = sub_vcf_size, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries - } - - if (run_ob_filter) { - call LearnReadOrientationModel { - input: - f1r2_tar_gz = M2.f1r2_counts, - gatk_override = gatk_override, - 
gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries - } - } - - call MergeVCFs { - input: - input_vcfs = M2.unfiltered_vcf, - input_vcf_indices = M2.unfiltered_vcf_idx, - output_name = unfiltered_name, - compress = compress, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad - } - - if (make_bamout_or_default) { - call SumFloats as SumSubBamouts { - input: - sizes = sub_bamout_size, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries - } - - call MergeBamOuts { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - bam_outs = M2.output_bamOut, - output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), - gatk_override = gatk_override, - gatk_docker = gatk_docker, - disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad, - max_retries = max_retries - } - } - - call MergeStats { - input: - stats = M2.stats, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - if (defined(variants_for_contamination)) { - call MergePileupSummaries as MergeTumorPileups { - input: - input_tables = M2.tumor_pileups, - output_name = output_basename, - ref_dict = ref_dict, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad - } - - if (defined(normal_bam)){ - call MergePileupSummaries as MergeNormalPileups { - input: - input_tables = M2.normal_pileups, - output_name = output_basename, - ref_dict = ref_dict, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(SumSubVcfs.total_size * 
large_input_to_output_multiplier) + disk_pad - } - } - - call CalculateContamination { - input: - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - gatk_docker = gatk_docker, - tumor_pileups = MergeTumorPileups.merged_table, - normal_pileups = MergeNormalPileups.merged_table, - disk_space = tumor_bam_size + normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad - } - } - - call Filter { - input: - ref_fasta = ref_fasta, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - intervals = intervals, - unfiltered_vcf = MergeVCFs.merged_vcf, - output_name = filtered_name, - compress = compress, - preemptible_attempts = preemptible_attempts, - mutect_stats = MergeStats.merged_stats, - max_retries = max_retries, - contamination_table = CalculateContamination.contamination_table, - maf_segments = CalculateContamination.maf_segments, - artifact_priors_tar_gz = LearnReadOrientationModel.artifact_prior_table, - m2_extra_filtering_args = m2_extra_filtering_args, - disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad - } - - if (defined(realignment_index_bundle)) { - File realignment_filter_input = Filter.filtered_vcf - call FilterAlignmentArtifacts { - input: - gatk_override = gatk_override, - bam = tumor_bam, - realignment_index_bundle = select_first([realignment_index_bundle]), - realignment_extra_args = realignment_extra_args, - gatk_docker = gatk_docker, - max_retries = max_retries, - compress = compress, - output_name = filtered_name, - input_vcf = realignment_filter_input - } - } - - if (run_oncotator_or_default) { - File oncotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) - call oncotate_m2 { - input: - m2_vcf = oncotate_vcf_input, - onco_ds_tar_gz = onco_ds_tar_gz, - onco_ds_local_db_dir = onco_ds_local_db_dir, - sequencing_center = sequencing_center, - 
sequence_source = sequence_source, - default_config_file = default_config_file, - case_id = M2.tumor_sample[0], - control_id = M2.normal_sample[0], - oncotator_docker = oncotator_docker_or_default, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(size(oncotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad, - filter_maf = filter_oncotator_maf_or_default, - oncotator_extra_args = oncotator_extra_args - } - } - - if (run_funcotator_or_default) { - File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) - File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx]) - call Funcotate { - input: - ref_fasta = ref_fasta, - input_vcf = funcotate_vcf_input, - input_vcf_idx = funcotate_vcf_input_index, - reference_version = select_first([funco_reference_version, "hg19"]), - output_file_base_name = basename(funcotate_vcf_input, ".vcf") + ".annotated", - output_format = if defined(funco_output_format) then "" + funco_output_format else funco_default_output_format, - compress = if defined(funco_compress) then funco_compress else false, - use_gnomad = if defined(funco_use_gnomad_AF) then funco_use_gnomad_AF else false, - data_sources_tar_gz = funco_data_sources_tar_gz, - case_id = M2.tumor_sample[0], - control_id = M2.normal_sample[0], - sequencing_center = sequencing_center, - sequence_source = sequence_source, - transcript_selection_mode = funco_transcript_selection_mode, - transcript_selection_list = funco_transcript_selection_list, - annotation_defaults = funco_annotation_defaults, - annotation_overrides = funco_annotation_overrides, - funcotator_excluded_fields = funcotator_excluded_fields, - filter_funcotations = filter_funcotations_or_default, - extra_args = funcotator_extra_args, - gatk_docker = gatk_docker, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - 
max_retries = max_retries, - disk_space_gb = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad - } - } - - output { - File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) - File filtered_vcf_idx = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx]) - File filtering_stats = Filter.filtering_stats - File mutect_stats = MergeStats.merged_stats - File? contamination_table = CalculateContamination.contamination_table - - File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf - File? funcotated_file = Funcotate.funcotated_output_file - File? funcotated_file_index = Funcotate.funcotated_output_file_index - File? bamout = MergeBamOuts.merged_bam_out - File? bamout_index = MergeBamOuts.merged_bam_out_index - File? maf_segments = CalculateContamination.maf_segments - File? read_orientation_model_params = LearnReadOrientationModel.artifact_prior_table - } -} - -task CramToBam { - - File ref_fasta - File ref_fai - File ref_dict - File cram - File crai - String name - Int disk_size - Int? mem - - Int machine_mem = if defined(mem) then mem * 1000 else 6000 - - #Calls samtools view to do the conversion - command { - #Set -e and -o says if any command I run fails in this script, make sure to return a failure - set -e - set -o pipefail - - samtools view -h -T ${ref_fasta} ${cram} | - samtools view -b -o ${name}.bam - - samtools index -b ${name}.bam - mv ${name}.bam.bai ${name}.bai - } - - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" - memory: machine_mem + " MB" - disks: "local-disk " + disk_size + " HDD" - } - - output { - File output_bam = "${name}.bam" - File output_bai = "${name}.bai" - } -} - -task SplitIntervals { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - String? split_intervals_extra_args - - File? gatk_override - - # runtime - String gatk_docker - Int? 
mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - mkdir interval-files - gatk --java-options "-Xmx${command_mem}m" SplitIntervals \ - -R ${ref_fasta} \ - ${"-L " + intervals} \ - -scatter ${scatter_count} \ - -O interval-files \ - ${split_intervals_extra_args} - cp interval-files/*.interval_list . - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - Array[File] interval_files = glob("*.interval_list") - } -} - -task M2 { - # inputs - String? intervals - String ref_fasta - String tumor_bam - String? normal_bam - String? pon - String? gnomad - String? m2_extra_args - Boolean? make_bamout - Boolean? run_ob_filter - Boolean compress - String? gga_vcf - String? gga_vcf_idx - String? variants_for_contamination - - String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - - String output_stats = output_vcf + ".stats" - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - - command <<< - set -e - - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - # We need to create these files regardless, even if they stay empty - touch bamout.bam - touch f1r2.tar.gz - echo "" > normal_name.txt - - gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode - tumor_command_line="-I ${tumor_bam} -tumor `cat tumor_name.txt`" - - if [[ ! -z "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${normal_bam} -O normal_name.txt -encode - normal_command_line="-I ${normal_bam} -normal `cat normal_name.txt`" - fi - - gatk --java-options "-Xmx${command_mem}m" Mutect2 \ - -R ${ref_fasta} \ - $tumor_command_line \ - $normal_command_line \ - ${"--germline-resource " + gnomad} \ - ${"-pon " + pon} \ - ${"-L " + intervals} \ - ${"--alleles " + gga_vcf} \ - -O "${output_vcf}" \ - ${true='--bam-output bamout.bam' false='' make_bamout} \ - ${true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \ - ${m2_extra_args} - - ### GetPileupSummaries - # These must be created, even if they remain empty, as cromwell doesn't support optional output - touch tumor-pileups.table - touch normal-pileups.table - - if [[ ! -z "${variants_for_contamination}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ - -V ${variants_for_contamination} -L ${variants_for_contamination} -O tumor-pileups.table - - if [[ ! 
-z "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ - -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal-pileups.table - fi - fi - >>> - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File unfiltered_vcf = "${output_vcf}" - File unfiltered_vcf_idx = "${output_vcf_idx}" - File output_bamOut = "bamout.bam" - String tumor_sample = read_string("tumor_name.txt") - String normal_sample = read_string("normal_name.txt") - File stats = "${output_stats}" - File f1r2_counts = "f1r2.tar.gz" - File tumor_pileups = "tumor-pileups.table" - File normal_pileups = "normal-pileups.table" - } -} - -task MergeVCFs { - # inputs - Array[File] input_vcfs - Array[File] input_vcf_indices - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 1000 - - # using MergeVcfs instead of GatherVcfs so we can create indices - # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs. 
- command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" MergeVcfs -I ${sep=' -I ' input_vcfs} -O ${output_vcf} - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File merged_vcf = "${output_vcf}" - File merged_vcf_idx = "${output_vcf_idx}" - } -} - -task MergeBamOuts { - # inputs - File ref_fasta - File ref_fai - File ref_dict - Array[File]+ bam_outs - String output_vcf_name - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 1000 - - command <<< - # This command block assumes that there is at least one file in bam_outs. 
- # Do not call this task if len(bam_outs) == 0 - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" GatherBamFiles \ - -I ${sep=" -I " bam_outs} -O unsorted.out.bam -R ${ref_fasta} - - # We must sort because adjacent scatters may have overlapping (padded) assembly regions, hence - # overlapping bamouts - - gatk --java-options "-Xmx${command_mem}m" SortSam -I unsorted.out.bam \ - -O ${output_vcf_name}.out.bam \ - --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT - gatk --java-options "-Xmx${command_mem}m" BuildBamIndex -I ${output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT - >>> - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File merged_bam_out = "${output_vcf_name}.out.bam" - File merged_bam_out_index = "${output_vcf_name}.out.bai" - } -} - - -task MergeStats { - # inputs - Array[File]+ stats - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 2000 - Int command_mem = machine_mem - 1000 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - - gatk --java-options "-Xmx${command_mem}m" MergeMutectStats \ - -stats ${sep=" -stats " stats} -O merged.stats - - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 10]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File merged_stats = "merged.stats" - } -} - -task MergePileupSummaries { - # input_tables needs to be optional because GetPileupSummaries is in an if-block - Array[File?] input_tables - String output_name - File? gatk_override - File ref_dict - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 1000 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" GatherPileupSummaries \ - --sequence-dictionary ${ref_dict} \ - -I ${sep=' -I ' input_tables} \ - -O ${output_name}.tsv - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 3]) - cpu: select_first([cpu, 1]) - } - - output { - File merged_table = "${output_name}.tsv" - } -} - -# Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018 -task LearnReadOrientationModel { - Array[File] f1r2_tar_gz - File? gatk_override - - # runtime - Int? max_retries - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 8000 - Int command_mem = machine_mem - 1000 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" LearnReadOrientationModel \ - -I ${sep=" -I " f1r2_tar_gz} \ - -O "artifact-priors.tar.gz" - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 3]) - cpu: select_first([cpu, 1]) - } - - output { - File artifact_prior_table = "artifact-priors.tar.gz" - } - -} - -task CalculateContamination { - # inputs - String? intervals - File tumor_pileups - File? normal_pileups - - File? gatk_override - - # runtime - Int? preemptible_attempts - Int? max_retries - String gatk_docker - Int? disk_space - Int? mem - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3000 - Int command_mem = machine_mem - 500 - - command { - set -e - - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I ${tumor_pileups} \ - -O contamination.table --tumor-segmentation segments.table ${"-matched " + normal_pileups} - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: command_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - } - - output { - File contamination_table = "contamination.table" - File maf_segments = "segments.table" - } -} - -task Filter { - # inputs - String? 
intervals - String ref_fasta - String unfiltered_vcf - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - File? mutect_stats - File? artifact_priors_tar_gz - File? contamination_table - File? maf_segments - String? m2_extra_filtering_args - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 500 - - command { - set -e - - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \ - -R ${ref_fasta} \ - -O ${output_vcf} \ - ${"--contamination-table " + contamination_table} \ - ${"--tumor-segmentation " + maf_segments} \ - ${"--ob-priors " + artifact_priors_tar_gz} \ - ${"-stats " + mutect_stats} \ - --filtering-stats filtering.stats \ - ${m2_extra_filtering_args} - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File filtered_vcf = "${output_vcf}" - File filtered_vcf_idx = "${output_vcf_idx}" - File filtering_stats = "filtering.stats" - } -} - -task FilterAlignmentArtifacts { - #input - File? gatk_override - String input_vcf - String bam - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - File realignment_index_bundle - String? 
realignment_extra_args - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 9000 - Int command_mem = machine_mem - 500 - - command { - set -e - - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" FilterAlignmentArtifacts \ - -V ${input_vcf} \ - -I ${bam} \ - --bwa-mem-index-image ${realignment_index_bundle} \ - ${realignment_extra_args} \ - -O ${output_vcf} - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: command_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File filtered_vcf = "${output_vcf}" - File filtered_vcf_idx = "${output_vcf_idx}" - } -} - -task oncotate_m2 { - # inputs - File m2_vcf - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? oncotator_exe - String? sequencing_center - String? sequence_source - File? default_config_file - String case_id - String? control_id - String? oncotator_extra_args - - # runtime - String oncotator_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - Boolean? 
filter_maf - Boolean is_filter_maf = select_first([filter_maf, true]) - String filter_maf_args = if (is_filter_maf) then " --collapse-filter-cols --prune-filter-cols " else "" - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - command <<< - # fail if *any* command below (not just the last) doesn't return 0, in particular if wget fails - set -e - - # local db dir is a directory and has been specified - if [[ -d "${onco_ds_local_db_dir}" ]]; then - echo "Using local db-dir: ${onco_ds_local_db_dir}" - echo "THIS ONLY WORKS WITHOUT DOCKER!" - ln -s ${onco_ds_local_db_dir} onco_dbdir - elif [[ "${onco_ds_tar_gz}" == *.tar.gz ]]; then - echo "Using given tar file: ${onco_ds_tar_gz}" - mkdir onco_dbdir - tar zxvf ${onco_ds_tar_gz} -C onco_dbdir --strip-components 1 - else - echo "Downloading and installing oncotator datasources from Broad FTP site..." - # Download and untar the db-dir - wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/oncotator/oncotator_v1_ds_April052016.tar.gz - tar zxvf oncotator_v1_ds_April052016.tar.gz - ln -s oncotator_v1_ds_April052016 onco_dbdir - fi - - ${default="/root/oncotator_venv/bin/oncotator" oncotator_exe} --db-dir onco_dbdir/ -c $HOME/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \ - -v ${m2_vcf} ${case_id}.maf.annotated hg19 -i VCF -o TCGAMAF --skip-no-alt --collapse-number-annotations --log_name oncotator.log \ - -a Center:${default="Unknown" sequencing_center} \ - -a source:${default="Unknown" sequence_source} \ - -a normal_barcode:${control_id} \ - -a tumor_barcode:${case_id} \ - ${"--default_config " + default_config_file} \ - ${filter_maf_args} \ - ${oncotator_extra_args} - >>> - - runtime { - docker: oncotator_docker - memory: machine_mem + " MB" - bootDiskSizeGb: 12 - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: 
select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File oncotated_m2_maf="${case_id}.maf.annotated" - } -} - -# Calculates sum of a list of floats -task SumFloats { - Array[Float] sizes - - # Runtime parameters - Int? preemptible_attempts - Int? max_retries - - command <<< - python -c "print ${sep="+" sizes}" - >>> - - output { - Float total_size = read_float(stdout()) - } - - runtime { - docker: "python:2.7" - disks: "local-disk " + 10 + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - } -} - -task Funcotate { - # ============== - # Inputs - String ref_fasta - String input_vcf - String input_vcf_idx - String reference_version - String output_file_base_name - String output_format - Boolean compress - Boolean use_gnomad - # This should be updated when a new version of the data sources is released - # TODO: Make this dynamically chosen in the command. - File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz" - String? control_id - String? case_id - String? sequencing_center - String? sequence_source - String? transcript_selection_mode - File? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? filter_funcotations - File? interval_list - - String? 
extra_args - - # ============== - # Process input args: - String output_maf = output_file_base_name + ".maf" - String output_maf_index = output_maf + ".idx" - String output_vcf = output_file_base_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - String output_file = if output_format == "MAF" then output_maf else output_vcf - String output_file_index = if output_format == "MAF" then output_maf_index else output_vcf_idx - String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else "" - String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" - String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" - String filter_funcotations_args = if defined(filter_funcotations) && (filter_funcotations) then " --remove-filtered-variants " else "" - String excluded_fields_args = if defined(funcotator_excluded_fields) then " --exclude-field " else "" - String interval_list_arg = if defined(interval_list) then " -L " else "" - String extra_args_arg = select_first([extra_args, ""]) - - # ============== - # Runtime options: - String gatk_docker - File? gatk_override - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space_gb - Int? cpu - - Boolean use_ssd = false - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 3000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. 
- Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - - String dollar = "$" - - command <<< - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - # Extract our data sources: - echo "Extracting data sources zip file..." - mkdir datasources_dir - tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1 - DATA_SOURCES_FOLDER="$PWD/datasources_dir" - - # Handle gnomAD: - if ${use_gnomad} ; then - echo "Enabling gnomAD..." - for potential_gnomad_gz in gnomAD_exome.tar.gz gnomAD_genome.tar.gz ; do - if [[ -f ${dollar}{DATA_SOURCES_FOLDER}/${dollar}{potential_gnomad_gz} ]] ; then - cd ${dollar}{DATA_SOURCES_FOLDER} - tar -zvxf ${dollar}{potential_gnomad_gz} - cd - - else - echo "ERROR: Cannot find gnomAD folder: ${dollar}{potential_gnomad_gz}" 1>&2 - false - fi - done - fi - - # Run Funcotator: - gatk --java-options "-Xmx${command_mem}m" Funcotator \ - --data-sources-path $DATA_SOURCES_FOLDER \ - --ref-version ${reference_version} \ - --output-file-format ${output_format} \ - -R ${ref_fasta} \ - -V ${input_vcf} \ - -O ${output_file} \ - ${interval_list_arg} ${default="" interval_list} \ - --annotation-default normal_barcode:${default="Unknown" control_id} \ - --annotation-default tumor_barcode:${default="Unknown" case_id} \ - --annotation-default Center:${default="Unknown" sequencing_center} \ - --annotation-default source:${default="Unknown" sequence_source} \ - ${"--transcript-selection-mode " + transcript_selection_mode} \ - ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \ - ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \ - ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \ - ${excluded_fields_args}${default="" sep=" --exclude-field " 
funcotator_excluded_fields} \ - ${filter_funcotations_args} \ - ${extra_args_arg} - # Make sure we have a placeholder index for MAF files so this workflow doesn't fail: - if [[ "${output_format}" == "MAF" ]] ; then - touch ${output_maf_index} - fi - >>> - - runtime { - docker: gatk_docker - bootDiskSizeGb: 20 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File funcotated_output_file = "${output_file}" - File funcotated_output_file_index = "${output_file_index}" - } - } diff --git a/mutect2_pon.inputs.json b/mutect2_pon.inputs.json index 7829330..7f809d1 100644 --- a/mutect2_pon.inputs.json +++ b/mutect2_pon.inputs.json @@ -1,20 +1,16 @@ { - "##_COMMENT1": "Inputs", "Mutect2_Panel.pon_name":"panel_of_normal", "Mutect2_Panel.normal_bams":"Array[File]", "Mutect2_Panel.normal_bais":"Array[File]", - "##_COMMENT2": "Primary resources", "Mutect2_Panel.ref_fasta":"gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.fasta", "Mutect2_Panel.ref_fai":"gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.fasta.fai", "Mutect2_Panel.ref_dict":"gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.dict", "Mutect2_Panel.scatter_count":10, - "##_COMMENT4": "Secondary resources", "Mutect2_Panel.intervals":"gs://gatk-best-practices/somatic-b37/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list", "Mutect2_Panel.Mutect2.variants_for_contamination":"gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf", "Mutect2_Panel.gnomad":"gs://gatk-best-practices/somatic-b37/af-only-gnomad.raw.sites.vcf", - "##_COMMENT6": "Docker", - "Mutect2_Panel.gatk_docker":"broadinstitute/gatk:4.1.2.0" + "Mutect2_Panel.gatk_docker":"broadinstitute/gatk:4.1.4.0" } diff --git a/mutect2_pon.wdl 
b/mutect2_pon.wdl index 3b794bb..1176a89 100644 --- a/mutect2_pon.wdl +++ b/mutect2_pon.wdl @@ -1,3 +1,5 @@ +version 1.0 + # Create a Mutect2 panel of normals # # Description of inputs @@ -9,18 +11,21 @@ # m2_extra_args: additional command line parameters for Mutect2. This should not involve --max-mnp-distance, # which the wdl hard-codes to 0 because GenpmicsDBImport can't handle MNPs -import "https://raw.githubusercontent.com/gatk-workflows/gatk4-somatic-snvs-indels/2.5.0/mutect2_nio.wdl" as m2 +#import "mutect2.wdl" as m2 + +import "https://raw.githubusercontent.com/gatk-workflows/gatk4-somatic-snvs-indels/2.6.0/mutect2.wdl" as m2 workflow Mutect2_Panel { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - Array[String] normal_bams - Array[String] normal_bais - String gnomad + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + Int scatter_count + Array[String] normal_bams + Array[String] normal_bais + File gnomad + File gnomad_idx String? m2_extra_args String? create_pon_extra_args Boolean? compress @@ -28,14 +33,31 @@ workflow Mutect2_Panel { Int? min_contig_size Int? num_contigs - Int contig_size = select_first([min_contig_size, 1000000]) - - File? gatk_override # runtime String gatk_docker - Int? preemptible_attempts + File? gatk_override + String basic_bash_docker = "ubuntu:16.04" + + Int? preemptible Int? max_retries + Int small_task_cpu = 2 + Int small_task_mem = 4 + Int small_task_disk = 100 + Int boot_disk_size = 12 + + # Use as a last resort to increase the disk given to every task in case of ill behaving data + Int? 
emergency_extra_disk + } + + Int contig_size = select_first([min_contig_size, 1000000]) + Int preemptible_or_default = select_first([preemptible, 2]) + Int max_retries_or_default = select_first([max_retries, 2]) + + Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, + "max_retries": max_retries_or_default, "preemptible": preemptible_or_default, "cpu": small_task_cpu, + "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, + "disk": small_task_disk, "boot_disk_size": boot_disk_size} scatter (normal_bam in zip(normal_bams, normal_bais)) { call m2.Mutect2 { @@ -50,7 +72,7 @@ workflow Mutect2_Panel { m2_extra_args = select_first([m2_extra_args, ""]) + "--max-mnp-distance 0", gatk_override = gatk_override, gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, + preemptible = preemptible, max_retries = max_retries } } @@ -62,8 +84,7 @@ workflow Mutect2_Panel { ref_dict = ref_dict, scatter_count = select_first([num_contigs, 24]), split_intervals_extra_args = "--subdivision-mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION --min-contig-size " + contig_size, - gatk_override = gatk_override, - gatk_docker = gatk_docker + runtime_params = standard_runtime } scatter (subintervals in SplitIntervals.interval_files ) { @@ -75,12 +96,10 @@ workflow Mutect2_Panel { ref_fai = ref_fai, ref_dict = ref_dict, gnomad = gnomad, + gnomad_idx = gnomad_idx, output_vcf_name = pon_name, create_pon_extra_args = create_pon_extra_args, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - gatk_docker = gatk_docker + runtime_params = standard_runtime } } @@ -90,10 +109,7 @@ workflow Mutect2_Panel { input_vcf_indices = CreatePanel.output_vcf_index, output_name = pon_name, compress = select_first([compress, false]), - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries + 
runtime_params = standard_runtime } output { @@ -105,49 +121,51 @@ workflow Mutect2_Panel { } task CreatePanel { - # inputs - File intervals - Array[String] input_vcfs - File ref_fasta - File ref_fai - File ref_dict - String output_vcf_name - String gnomad - String? create_pon_extra_args - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space + input { + File intervals + Array[String] input_vcfs + File ref_fasta + File ref_fai + File ref_dict + String output_vcf_name + File gnomad + File gnomad_idx + String? create_pon_extra_args + + # runtime + Runtime runtime_params + } - Int machine_mem = select_first([mem, 8]) + Int machine_mem = 8 Int command_mem = machine_mem - 1 + parameter_meta{ + gnomad: {localization_optional: true} + gnomad_idx: {localization_optional: true} + } + command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - gatk GenomicsDBImport --genomicsdb-workspace-path pon_db -R ${ref_fasta} -V ${sep=' -V ' input_vcfs} -L ${intervals} + gatk GenomicsDBImport --genomicsdb-workspace-path pon_db -R ~{ref_fasta} -V ~{sep=' -V ' input_vcfs} -L ~{intervals} - gatk --java-options "-Xmx${command_mem}g" CreateSomaticPanelOfNormals -R ${ref_fasta} --germline-resource ${gnomad} \ - -V gendb://pon_db -O ${output_vcf_name}.vcf ${create_pon_extra_args} + gatk --java-options "-Xmx~{command_mem}g" CreateSomaticPanelOfNormals -R ~{ref_fasta} --germline-resource ~{gnomad} \ + -V gendb://pon_db -O ~{output_vcf_name}.vcf ~{create_pon_extra_args} } runtime { - docker: gatk_docker - bootDiskSizeGb: 12 + docker: runtime_params.gatk_docker + bootDiskSizeGb: runtime_params.boot_disk_size memory: machine_mem + " GB" - disks: "local-disk " + select_first([disk_space, 100]) + " HDD" - preemptible: select_first([preemptible_attempts, 3]) - maxRetries: select_first([max_retries, 0]) + disks: 
"local-disk " + runtime_params.disk + " HDD" + preemptible: runtime_params.preemptible + maxRetries: runtime_params.max_retries + cpu: runtime_params.cpu } output { - File output_vcf = "${output_vcf_name}.vcf" - File output_vcf_index = "${output_vcf_name}.vcf.idx" + File output_vcf = "~{output_vcf_name}.vcf" + File output_vcf_index = "~{output_vcf_name}.vcf.idx" } }