diff --git a/bin/evidence_string_generation.py b/bin/evidence_string_generation.py index c3c2fd45..c24e8f20 100755 --- a/bin/evidence_string_generation.py +++ b/bin/evidence_string_generation.py @@ -9,12 +9,10 @@ parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True) parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True) parser.add_argument('--out', help='Output directory', required=True) -parser.add_argument('--include-structural', help='Use structural variants consequence prediction pipeline', - action='store_true', default=False, required=False) if __name__ == '__main__': args = parser.parse_args() clinvar_to_evidence_strings.launch_pipeline( clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping, - ot_schema_file=args.ot_schema, dir_out=args.out, include_structural=args.include_structural) + ot_schema_file=args.ot_schema, dir_out=args.out) diff --git a/consequence_prediction/run_repeat_expansion_variants.py b/consequence_prediction/run_repeat_expansion_variants.py index abc04d4b..e9e2ecaa 100755 --- a/consequence_prediction/run_repeat_expansion_variants.py +++ b/consequence_prediction/run_repeat_expansion_variants.py @@ -14,7 +14,7 @@ help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.' ) parser.add_argument( - '--output-dataframe', required=True, + '--output-dataframe', required=False, help='File to output full dataframe for subsequent analysis and debugging.' ) args = parser.parse_args() diff --git a/consequence_prediction/run_structural_variants.py b/consequence_prediction/run_structural_variants.py new file mode 100755 index 00000000..cd2117c2 --- /dev/null +++ b/consequence_prediction/run_structural_variants.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +"""A wrapper script for running the repeat expansion pipeline.""" + +import argparse +import structural_variants.pipeline + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--clinvar-xml', required=True, + help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)' +) +parser.add_argument( + '--output-consequences', required=True, + help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.' +) + +args = parser.parse_args() +structural_variants.pipeline.main(args.clinvar_xml, args.output_consequences) diff --git a/consequence_prediction/structural_variants/pipeline.py b/consequence_prediction/structural_variants/pipeline.py index 232ec150..214f7ca7 100644 --- a/consequence_prediction/structural_variants/pipeline.py +++ b/consequence_prediction/structural_variants/pipeline.py @@ -68,7 +68,16 @@ def get_vep_results(clinvar_xml): return vep_results -def main(clinvar_xml): +def generate_consequences_file(consequences, output_consequences): + """Output final table.""" + if consequences.empty: + logger.info('There are no records ready for output') + return + # Write the consequences table. This is used by the main evidence string generation pipeline. + consequences.to_csv(output_consequences, sep='\t', index=False, header=False) + + +def main(clinvar_xml, output_consequences=None): vep_results = get_vep_results(clinvar_xml) results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'}) variant_data = [] @@ -82,4 +91,6 @@ def main(clinvar_xml): # Return as a dataframe to be compatible with repeat expansion pipeline consequences = pd.DataFrame(variant_data, columns=('VariantID', 'EnsemblGeneID', 'EnsemblGeneName', 'ConsequenceTerm')) + if output_consequences is not None: + generate_consequences_file(consequences, output_consequences) return consequences diff --git a/docs/build.md b/docs/build.md index 5900aa84..9003b71d 100644 --- a/docs/build.md +++ b/docs/build.md @@ -52,6 +52,14 @@ export PYTHONPATH=${INSTALL_PATH} The installed Python version can then be called with either `python` or `python3`. You can also use either `pip` or `pip3` to install packages into this local distribution. +## Nextflow installation + +The evidence string generation pipeline uses Nextflow, which itself relies on Java. You can install in the current directory as follows: +```bash +wget -qO- https://get.nextflow.io | bash +``` +You can then include this in your `$PATH` variable if necessary, or invoke the executable directly. For more details on installing Nextflow, see the [documentation](https://www.nextflow.io/docs/latest/getstarted.html). + ## Deploying local OLS installation During the preparation of 2019_04 release, which had to be synchronized with EFO v3, OLS had to be deployed locally because the production deployment of OLS on www.ebi.ac.uk/ols only supported EFO v2 at the time. This can be done using the following command (substitute the image version as appropriate): diff --git a/docs/environment.md b/docs/environment.md index e7125a35..788b9acd 100644 --- a/docs/environment.md +++ b/docs/environment.md @@ -1,9 +1,9 @@ # Setting up the common environment -1. Log in to the LSF cluster, where all data processing must take place. +1. Log in to the LSF cluster (currently `codon`), where all data processing must take place. 1. Using a `become` command, switch to a common EVA production user instead of your personal account. 1. Adjust and execute the commands below. They will set up the environment, fetch and build the code. Notes: - - The first five variables are installation-specific and are blanked in this repository. You can get the values for the EVA installation from the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets-configuration.md). + - The first six variables are installation-specific and are blanked in this repository. You can get the values for the EVA installation from the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets-configuration.md). - By modifying the `GIT_REMOTE` and `GIT_BRANCH` variables, you can run an arbitrary version of the pipeline. This can be used for development and debugging. By default it will fetch the master branch from the main pipeline repository. - Running these commands will overwrite any local changes you had in the repository copy on the cluster. @@ -17,6 +17,9 @@ export PYTHON_INSTALL_PATH= # Location of bcftools installation path export BCFTOOLS_INSTALL_PATH= +# Location of Nextflow installation path +export NEXTFLOW_INSTALL_PATH= + # The directory where subdirectories for each batch will be created export BATCH_ROOT_BASE= @@ -26,8 +29,8 @@ export FTP_PATH_BASE= # Base bsub command line for all commands. export BSUB_CMDLINE="bsub" -# Setting up Python paths -export PATH=${PYTHON_INSTALL_PATH}:${BCFTOOLS_INSTALL_PATH}:$PATH +# Setting up paths +export PATH=${PYTHON_INSTALL_PATH}:${BCFTOOLS_INSTALL_PATH}:${NEXTFLOW_INSTALL_PATH}:$PATH export PYTHONPATH=${PYTHON_INSTALL_PATH} # External service paths @@ -44,4 +47,7 @@ source env/bin/activate python3 -m pip -q install --upgrade pip setuptools python3 -m pip -q install -r requirements.txt python3 setup.py install + +# Location of Python executable, pointing to the virtualenv +export PYTHON_BIN=${CODE_ROOT}/env/bin/python ``` diff --git a/docs/generate-evidence-strings.md b/docs/generate-evidence-strings.md index 49bcd3b0..e69e6e86 100644 --- a/docs/generate-evidence-strings.md +++ b/docs/generate-evidence-strings.md @@ -18,7 +18,7 @@ Next, set up the protocol-specific environment: export OT_RELEASE=YYYY-MM # Open Targets JSON schema version. -export OT_SCHEMA_VERSION=2.2.6 +export OT_SCHEMA_VERSION=2.2.8 ``` ## 1. Process data @@ -31,58 +31,15 @@ mkdir -p ${BATCH_ROOT} cd ${BATCH_ROOT} mkdir -p clinvar gene_mapping evidence_strings logs -# Download ClinVar data. We always use the most recent XML dump, which contains all data for the release. -wget \ - -O ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \ - https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz - -# Download the Open Targets JSON schema. -wget \ - -O ${BATCH_ROOT}/evidence_strings/opentargets-${OT_SCHEMA_VERSION}.json \ - https://raw.githubusercontent.com/opentargets/json_schema/${OT_SCHEMA_VERSION}/opentargets.json - -# Run ClinVar variants through VEP and map them to genes and functional consequences. -${BSUB_CMDLINE} -K -M 10G \ - -o ${BATCH_ROOT}/logs/consequence_vep.out \ - -e ${BATCH_ROOT}/logs/consequence_vep.err \ - bash ${CODE_ROOT}/consequence_prediction/run_consequence_mapping.sh \ - ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \ - ${BATCH_ROOT}/gene_mapping/consequences_vep.tsv - -# Generate the evidence strings for submission to Open Targets. -${BSUB_CMDLINE} -K -M 10G \ - -o ${BATCH_ROOT}/logs/evidence_string_generation.out \ - -e ${BATCH_ROOT}/logs/evidence_string_generation.err \ - python3 ${CODE_ROOT}/bin/evidence_string_generation.py \ - --clinvar-xml ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \ - --efo-mapping ${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv \ - --gene-mapping ${BATCH_ROOT}/gene_mapping/consequences_vep.tsv \ - --ot-schema ${BATCH_ROOT}/evidence_strings/opentargets-${OT_SCHEMA_VERSION}.json \ - --out ${BATCH_ROOT}/evidence_strings/ \ - --include-structural - -# Check that the generated evidence strings do not contain any duplicated evidence strings. -# For every evidence string, we group the value of fields datatypeId, studyId, -# targetFromSourceId, variantId, variantFunctionalConsequenceId and diseaseFromSourceMappedId, -# all separated by tabs, sorted and saved at duplicates.tsv if found duplicated. -jq --arg sep $'\t' -jr \ - '.datatypeId,$sep,.studyId,$sep,.targetFromSourceId,$sep,.variantId,$sep,.variantFunctionalConsequenceId,$sep,.diseaseFromSourceMappedId,$sep,.diseaseFromSource,"\n"' \ - ${BATCH_ROOT}/evidence_strings/evidence_strings.json \ - | sort | uniq -d > ${BATCH_ROOT}/evidence_strings/duplicates.tsv - -# Convert MedGen and OMIM cross-references into ZOOMA format. -${BSUB_CMDLINE} -K \ - -o ${BATCH_ROOT}/logs/traits_to_zooma_format.out \ - -e ${BATCH_ROOT}/logs/traits_to_zooma_format.err \ - python3 ${CODE_ROOT}/bin/traits_to_zooma_format.py \ - --clinvar-xml ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \ - --zooma-feedback ${BATCH_ROOT}/clinvar/clinvar_xrefs.txt +# Run the nextflow pipeline, resuming execution of previous attempt if possible. +nextflow run ${CODE_ROOT}/eva_cttv_pipeline/evidence_string_generation/pipeline.nf \ + --batch_root ${BATCH_ROOT} \ + --schema ${OT_SCHEMA_VERSION} \ + -resume ``` -## 2. Manual follow-up actions - -### Check that generated evidence strings do not contain any duplicates -The algorithm used for generating the evidence strings should not allow any duplicate values to be emitted, and the file `${BATCH_ROOT}/evidence_strings/duplicates.tsv` should be empty. Check that this is the case. +### Note on duplication checks +The algorithm used for generating the evidence strings should not allow any duplicate values to be emitted, and the automated pipeline should fail with an error if duplicates are detected. A repeated evidence string will have identical values for these five fields: * **datatypeId** - Identifier of the type of data we are associating, varying between somatic and non-somatic ClinVar records (*e.g.* ``somatic_mutation`` or ``genetic_association`` respectively). @@ -94,6 +51,8 @@ A repeated evidence string will have identical values for these five fields: Nevertheless, we also report evidence strings in which ``diseaseFromSourceMappedId`` may be empty (``diseaseFromSourceMappedId: null``) - i.e. the phenotype has not been mapped to an ontology yet. Therefore, to check for duplicates we also take into account the field ``diseaseFromSource``, which is the string describing the phenotype within ClinVar records (and is never missing in any evidence string). +## 2. Manual follow-up actions + ### Update summary metrics After the evidence strings have been generated, summary metrics need to be updated in the Google Sheets [table](https://docs.google.com/spreadsheets/d/1g_4tHNWP4VIikH7Jb0ui5aNr0PiFgvscZYOe69g191k/) on the “Raw statistics” sheet. diff --git a/eva_cttv_pipeline/evidence_string_generation/clinvar_to_evidence_strings.py b/eva_cttv_pipeline/evidence_string_generation/clinvar_to_evidence_strings.py index 00ad502a..5f4ca7e4 100644 --- a/eva_cttv_pipeline/evidence_string_generation/clinvar_to_evidence_strings.py +++ b/eva_cttv_pipeline/evidence_string_generation/clinvar_to_evidence_strings.py @@ -8,8 +8,6 @@ import jsonschema -from consequence_prediction.repeat_expansion_variants import pipeline as repeat_pipeline -from consequence_prediction.structural_variants import pipeline as structural_pipeline from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io from eva_cttv_pipeline.evidence_string_generation import consequence_type as CT @@ -113,29 +111,20 @@ def validate_evidence_string(ev_string, ot_schema_contents): sys.exit(1) -def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out, - include_structural=False): +def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out): os.makedirs(dir_out, exist_ok=True) string_to_efo_mappings = load_efo_mapping(efo_mapping_file) - - repeat_consequences = repeat_pipeline.main(clinvar_xml_file) - if include_structural: - structural_consequences = structural_pipeline.main(clinvar_xml_file) - complex_consequences = CT.process_consequence_type_dataframes(repeat_consequences, structural_consequences) - else: - complex_consequences = CT.process_consequence_type_dataframes(repeat_consequences) - variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file, complex_consequences) + variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file) report = clinvar_to_evidence_strings( string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file, - output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), - include_structural=include_structural) + output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME)) print(report.collate_report()) report.write_unmapped_terms(dir_out) def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema, - output_evidence_strings, include_structural): + output_evidence_strings): report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings) ot_schema_contents = json.loads(open(ot_schema).read()) output_evidence_strings_file = open(output_evidence_strings, 'wt') @@ -160,7 +149,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele # origins, (2) EFO mappings, and (3) genes where the variant has effect. grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins) - consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings, include_structural) + consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings) grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names, string_to_efo_mappings) @@ -192,8 +181,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings grouped_allele_origins, grouped_diseases, consequence_types): disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id, - disease_mapped_efo_id, consequence_attributes, - include_structural=include_structural) + disease_mapped_efo_id, consequence_attributes) # Validate and immediately output the evidence string (not keeping everything in memory). validate_evidence_string(evidence_string, ot_schema_contents) @@ -219,7 +207,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings def generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id, disease_mapped_efo_id, - consequence_attributes, include_structural=False): + consequence_attributes): """Generates an evidence string based on ClinVar record and some additional attributes.""" is_somatic = allele_origins == ['somatic'] evidence_string = { @@ -268,7 +256,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea # required by the Open Targets JSON schema. 'diseaseFromSourceMappedId': disease_mapped_efo_id.split('/')[-1] if disease_mapped_efo_id else None, } - if include_structural and clinvar_record.measure.preferred_current_hgvs: + if clinvar_record.measure.preferred_current_hgvs: evidence_string['variantHgvsId'] = clinvar_record.measure.preferred_current_hgvs.text # Remove the attributes with empty values (either None or empty lists). @@ -276,7 +264,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea return evidence_string -def get_consequence_types(clinvar_record_measure, consequence_type_dict, include_structural=False): +def get_consequence_types(clinvar_record_measure, consequence_type_dict): """Returns the list of functional consequences for a given ClinVar record measure. This is the place where ClinVar records are paired with the information about gene and functional consequences. @@ -317,7 +305,7 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict, include return consequence_type_dict[coord_id] # If there's also no complete coordinates, pair using HGVS - if include_structural and clinvar_record_measure.preferred_current_hgvs: + if clinvar_record_measure.preferred_current_hgvs: hgvs_id = clinvar_record_measure.preferred_current_hgvs.text if hgvs_id in consequence_type_dict: consequences = consequence_type_dict[hgvs_id] diff --git a/eva_cttv_pipeline/evidence_string_generation/consequence_type.py b/eva_cttv_pipeline/evidence_string_generation/consequence_type.py index d2b10379..f224f1d9 100644 --- a/eva_cttv_pipeline/evidence_string_generation/consequence_type.py +++ b/eva_cttv_pipeline/evidence_string_generation/consequence_type.py @@ -13,23 +13,6 @@ def process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term): consequence_type_dict[variant_id].append(ConsequenceType(ensembl_gene_id, SoTerm(so_term))) -def process_consequence_type_dataframes(*dataframes): - """ - Return a dictionary of consequence information extracted from one or more dataframes. - Assumes all dataframes are in the same format. - """ - consequence_type_dict = defaultdict(list) - for consequences_dataframe in dataframes: - for row in consequences_dataframe.itertuples(): - variant_id = row[1] - ensembl_gene_id = row[2] - so_term = row[4] - - process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term) - - return consequence_type_dict - - def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None): """ Return a dictionary of consequence information extracted from the given file. @@ -88,7 +71,6 @@ class SoTerm(object): Represents a sequence ontology term belonging to a consequence type object. Holds information on accession and rank. """ - so_accession_name_dict = get_so_accession_dict() ranked_so_names_list = get_severity_ranking() diff --git a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf new file mode 100644 index 00000000..bda65227 --- /dev/null +++ b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf @@ -0,0 +1,266 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + + +def helpMessage() { + log.info""" + Generate ClinVar evidence strings for Open Targets. + + Params: + --batch_root Directory for current batch + --schema Open Targets JSON schema version + --clinvar ClinVar XML file (optional, will download latest if omitted) + """ +} + +params.help = null +params.batch_root = null +params.schema = null +params.clinvar = null + +if (params.help) { + exit 0, helpMessage() +} +if (!params.batch_root || !params.schema) { + exit 1, helpMessage() +} +batchRoot = params.batch_root + + +/* + * Main workflow. + */ +workflow { + if (params.clinvar != null) { + clinvarXml = Channel.fromPath(params.clinvar) + } else { + clinvarXml = downloadClinvar() + } + downloadJsonSchema() + + runSnpIndel(clinvarXml) + runRepeat(clinvarXml) + runStructural(clinvarXml) + combineConsequences(runSnpIndel.out.consequencesSnp, + runRepeat.out.consequencesRepeat, + runStructural.out.consequencesStructural) + + generateEvidence(clinvarXml, + downloadJsonSchema.out.jsonSchema, + combineConsequences.out.consequencesCombined) + checkDuplicates(generateEvidence.out.evidenceStrings) + + convertXrefs(clinvarXml) +} + +/* + * Download ClinVar data, using the most recent XML dump. + */ +process downloadClinvar { + output: + path "clinvar.xml.gz", emit: clinvarXml + + script: + """ + wget -O clinvar.xml.gz \ + https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz + """ +} + +/* + * Download the Open Targets JSON schema. + */ +process downloadJsonSchema { + output: + path "opentargets-${params.schema}.json", emit: jsonSchema + + script: + """ + wget -O opentargets-${params.schema}.json \ + https://raw.githubusercontent.com/opentargets/json_schema/${params.schema}/opentargets.json + """ +} + +/* + * Run simple variants (SNPs and other variants with complete coordinates) through VEP and map them + * to genes and functional consequences. + */ +process runSnpIndel { + clusterOptions "-o ${batchRoot}/logs/consequence_snp.out \ + -e ${batchRoot}/logs/consequence_snp.err" + + publishDir "${batchRoot}/gene_mapping", + overwrite: true, + mode: "copy", + pattern: "*.tsv" + + input: + path clinvarXml + + output: + path "consequences_snp.tsv", emit: consequencesSnp + + script: + """ + \${PYTHON_BIN} "\${CODE_ROOT}/consequence_prediction/extract_variants_for_vep.py" --clinvar-xml ${clinvarXml} \ + | sort -u \ + | parallel \ + --halt now,fail=1 `# If any job fails, kill the remaining ones immediately and report failure` \ + --pipe `# Input is read from STDIN and split by chunks` \ + -j 20 `# Number of concurrent workers` \ + -N 200 `# Number of records (lines) per worker` \ + --tmpdir . `# Store temporary files in the current directory to avoid /tmp overflow` \ + \${PYTHON_BIN} "\${CODE_ROOT}/consequence_prediction/vep_mapping_pipeline/consequence_mapping.py" \ + | sort -u > consequences_snp.tsv + """ +} + +/* + * Extract repeat expansion variants from ClinVar and map them to genes. + */ +process runRepeat { + clusterOptions "-o ${batchRoot}/logs/consequence_repeat.out \ + -e ${batchRoot}/logs/consequence_repeat.err" + + publishDir "${batchRoot}/gene_mapping", + overwrite: true, + mode: "copy", + pattern: "*.tsv" + + input: + path clinvarXml + + output: + path "consequences_repeat.tsv", emit: consequencesRepeat + + script: + """ + \${PYTHON_BIN} \${CODE_ROOT}/consequence_prediction/run_repeat_expansion_variants.py \ + --clinvar-xml ${clinvarXml} \ + --output-consequences consequences_repeat.tsv + + # create an empty file if nothing generated + [[ -f consequences_repeat.tsv ]] || touch consequences_repeat.tsv + """ +} + +/* + * Run consequence and gene mapping for structural variants (i.e. no complete coordinates and not + * known repeat expansions). + */ +process runStructural { + clusterOptions "-o ${batchRoot}/logs/consequence_structural.out \ + -e ${batchRoot}/logs/consequence_structural.err" + + publishDir "${batchRoot}/gene_mapping", + overwrite: true, + mode: "copy", + pattern: "*.tsv" + + input: + path clinvarXml + + output: + path "consequences_structural.tsv", emit: consequencesStructural + + script: + """ + \${PYTHON_BIN} \${CODE_ROOT}/consequence_prediction/run_structural_variants.py \ + --clinvar-xml ${clinvarXml} \ + --output-consequences consequences_structural.tsv + + # create an empty file if nothing generated + [[ -f consequences_structural.tsv ]] || touch consequences_structural.tsv + """ +} + +/* + * Unite results of consequence mapping. + */ +process combineConsequences { + input: + path consequencesSnp + path consequencesRepeat + path consequencesStructural + + output: + path "consequences_combined.tsv", emit: consequencesCombined + + script: + """ + cat ${consequencesRepeat} ${consequencesSnp} ${consequencesStructural} > consequences_combined.tsv + """ +} + +/* + * Generate the evidence strings for submission to Open Targets. + */ +process generateEvidence { + clusterOptions "-o ${batchRoot}/logs/evidence_string_generation.out \ + -e ${batchRoot}/logs/evidence_string_generation.err" + + publishDir "${batchRoot}/evidence_strings", + overwrite: true, + mode: "copy", + pattern: "*.json" + + input: + path clinvarXml + path jsonSchema + path consequenceMappings + + output: + path "evidence_strings.json", emit: evidenceStrings + + script: + """ + \${PYTHON_BIN} \${CODE_ROOT}/bin/evidence_string_generation.py \ + --clinvar-xml ${clinvarXml} \ + --efo-mapping \${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv \ + --gene-mapping ${consequenceMappings} \ + --ot-schema ${jsonSchema} \ + --out . + """ +} + +/* + * Check that the generated evidence strings do not contain any duplicated evidence strings. + */ +process checkDuplicates { + input: + path evidenceStrings + + script: + """ + jq --arg sep \$'\t' -jr \ + '.datatypeId,\$sep,.studyId,\$sep,.targetFromSourceId,\$sep,.variantId,\$sep,.variantFunctionalConsequenceId,\$sep,.diseaseFromSourceMappedId,\$sep,.diseaseFromSource,"\n"' \ + ${evidenceStrings} | sort | uniq -d > duplicates.tsv + [[ ! -s duplicates.tsv ]] + """ +} + +/* + * Convert MedGen and OMIM cross-references into ZOOMA format. + */ +process convertXrefs { + clusterOptions "-o ${batchRoot}/logs/traits_to_zooma_format.out \ + -e ${batchRoot}/logs/traits_to_zooma_format.err" + + publishDir "${batchRoot}/clinvar", + overwrite: true, + mode: "copy", + pattern: "*.txt" + + input: + path clinvarXml + + output: + path "clinvar_xrefs.txt", emit: clinvarXrefs + + """ + \${PYTHON_BIN} \${CODE_ROOT}/bin/traits_to_zooma_format.py \ + --clinvar-xml ${clinvarXml} \ + --zooma-feedback clinvar_xrefs.txt + """ +} diff --git a/setup.py b/setup.py index 8ad34e3b..5d136262 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def get_requires(): setup(name='eva_cttv_pipeline', - version='2.5.4', + version='2.6.0', packages=find_packages(), install_requires=get_requires(), #! TBD: list as a dependency subpackage 'clinvar_xml_utils.clinvar_xml_utils.clinvar_xml_utils' diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/config.py b/tests/eva_cttv_pipeline/evidence_string_generation/config.py index c7d7593e..40192d6b 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/config.py +++ b/tests/eva_cttv_pipeline/evidence_string_generation/config.py @@ -2,7 +2,7 @@ from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io -OT_SCHEMA_VERSION = "2.2.6" +OT_SCHEMA_VERSION = "2.2.8" test_dir = os.path.dirname(__file__) efo_mapping_file = os.path.join(test_dir, 'resources', 'string_to_ontology_mappings.tsv') diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_genetics_evidence_string.json b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_genetics_evidence_string.json index 47ea7401..8245a994 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_genetics_evidence_string.json +++ b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_genetics_evidence_string.json @@ -22,6 +22,7 @@ "studyId": "RCV000002127", "targetFromSourceId": "ENSG00000139988", "variantFunctionalConsequenceId": "SO_0001583", + "variantHgvsId": "NC_000014.9:g.67729209A>G", "variantId": "14_67729209_A_G", "variantRsId": "rs28940313" } \ No newline at end of file diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_multiple_names_evidence_string.json b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_multiple_names_evidence_string.json index 5740864d..a7a398f2 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_multiple_names_evidence_string.json +++ b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_multiple_names_evidence_string.json @@ -23,6 +23,7 @@ "studyId": "RCV000415158", "targetFromSourceId": "ENSG00000139988", "variantFunctionalConsequenceId": "SO_0001583", + "variantHgvsId": "NC_000007.14:g.94423102G>A", "variantId": "7_94423102_G_A", "variantRsId": "rs1057518967" } \ No newline at end of file diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_somatic_evidence_string.json b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_somatic_evidence_string.json index a42b3944..8dee46cf 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_somatic_evidence_string.json +++ b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_somatic_evidence_string.json @@ -22,6 +22,7 @@ "studyId": "RCV000002127", "targetFromSourceId": "ENSG00000139988", "variantFunctionalConsequenceId": "SO_0001583", + "variantHgvsId": "NC_000014.9:g.67729209A>G", "variantId": "14_67729209_A_G", "variantRsId": "rs28940313" } \ No newline at end of file diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar_to_evidence_strings.py b/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar_to_evidence_strings.py index 39e73329..37dd45a5 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar_to_evidence_strings.py +++ b/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar_to_evidence_strings.py @@ -130,14 +130,11 @@ def test_structural_variant_consequences(self): structural_crm = config.get_test_clinvar_record('test_structural_record.xml.gz').measure consequences = [CT.ConsequenceType('ENSG00000075151', CT.SoTerm('splice_polypyrimidine_tract_variant'))] consequence_dict = {structural_crm.preferred_current_hgvs.text: consequences} - - # only get consequences from HGVS if include_structural is True - assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, consequence_dict, False) == [] - assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, consequence_dict, True) == consequences + assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, consequence_dict) == consequences # don't get consequences if there are more than MAX_TARGET_GENES long_consequence_dict = {structural_crm.preferred_current_hgvs.text: consequences * (MAX_TARGET_GENES+1)} - assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, long_consequence_dict, True) == [] + assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, long_consequence_dict) == [] class TestGenerateEvidenceStringTest: diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/test_consequence_type.py b/tests/eva_cttv_pipeline/evidence_string_generation/test_consequence_type.py index 0cfbd92e..7ecc591d 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/test_consequence_type.py +++ b/tests/eva_cttv_pipeline/evidence_string_generation/test_consequence_type.py @@ -25,18 +25,6 @@ def test_process_consequence_type_file_tsv(): assert consequence_type_dict["14:67729241:C:T"][0] == test_consequence_type -def test_process_consequence_type_dataframes(): - dataframe_1 = pd.DataFrame( - [('NC_000011.10:g.5226797_5226798insGCC', 'ENSG00000244734', 'HBB', 'coding_sequence_variant')], - columns=('VariantID', 'EnsemblGeneID', 'EnsemblGeneName', 'ConsequenceTerm')) - dataframe_2 = pd.DataFrame( - [('RCV001051772', 'ENSG00000130711', 'PRDM12', 'trinucleotide_repeat_expansion')], - columns=('1', '2', '3', '4')) # column names can be anything - consequence_type_dict = CT.process_consequence_type_dataframes(dataframe_1, dataframe_2) - assert consequence_type_dict['NC_000011.10:g.5226797_5226798insGCC'][0].ensembl_gene_id == 'ENSG00000244734' - assert consequence_type_dict['RCV001051772'][0].ensembl_gene_id == 'ENSG00000130711' - - def test_ensembl_so_term(): so_term = CT.SoTerm('stop_gained') assert so_term.accession == 'SO_0001587'