Skip to content

Commit

Permalink
Merge pull request #337 from apriltuesday/issue-333
Browse files Browse the repository at this point in the history
Convert evidence string generation to Nextflow
  • Loading branch information
apriltuesday authored Sep 5, 2022
2 parents 791cf5d + a5ec644 commit 0fd4fab
Show file tree
Hide file tree
Showing 17 changed files with 343 additions and 119 deletions.
4 changes: 1 addition & 3 deletions bin/evidence_string_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,10 @@
parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True)
parser.add_argument('--out', help='Output directory', required=True)
parser.add_argument('--include-structural', help='Use structural variants consequence prediction pipeline',
action='store_true', default=False, required=False)


if __name__ == '__main__':
args = parser.parse_args()
clinvar_to_evidence_strings.launch_pipeline(
clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
ot_schema_file=args.ot_schema, dir_out=args.out, include_structural=args.include_structural)
ot_schema_file=args.ot_schema, dir_out=args.out)
2 changes: 1 addition & 1 deletion consequence_prediction/run_repeat_expansion_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.'
)
parser.add_argument(
'--output-dataframe', required=True,
'--output-dataframe', required=False,
help='File to output full dataframe for subsequent analysis and debugging.'
)
args = parser.parse_args()
Expand Down
18 changes: 18 additions & 0 deletions consequence_prediction/run_structural_variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env python3
"""A wrapper script for running the structural variants pipeline."""

import argparse
import structural_variants.pipeline

# The docstring above doubles as the --help description for this script.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    '--clinvar-xml', required=True,
    help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)'
)
parser.add_argument(
    '--output-consequences', required=True,
    help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.'
)

args = parser.parse_args()
# Delegate all actual work to the structural variants consequence prediction pipeline.
structural_variants.pipeline.main(args.clinvar_xml, args.output_consequences)
13 changes: 12 additions & 1 deletion consequence_prediction/structural_variants/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,16 @@ def get_vep_results(clinvar_xml):
return vep_results


def main(clinvar_xml):
def generate_consequences_file(consequences, output_consequences):
    """Write the final consequences table as a headerless, tab-separated file.

    The output is consumed by the main evidence string generation pipeline.
    Nothing is written when the table has no rows.
    """
    if not consequences.empty:
        consequences.to_csv(output_consequences, sep='\t', index=False, header=False)
    else:
        logger.info('There are no records ready for output')


def main(clinvar_xml, output_consequences=None):
vep_results = get_vep_results(clinvar_xml)
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'})
variant_data = []
Expand All @@ -82,4 +91,6 @@ def main(clinvar_xml):
# Return as a dataframe to be compatible with repeat expansion pipeline
consequences = pd.DataFrame(variant_data, columns=('VariantID', 'EnsemblGeneID',
'EnsemblGeneName', 'ConsequenceTerm'))
if output_consequences is not None:
generate_consequences_file(consequences, output_consequences)
return consequences
8 changes: 8 additions & 0 deletions docs/build.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ export PYTHONPATH=${INSTALL_PATH}

The installed Python version can then be called with either `python` or `python3`. You can also use either `pip` or `pip3` to install packages into this local distribution.

## Nextflow installation

The evidence string generation pipeline uses Nextflow, which itself relies on Java. You can install it in the current directory as follows:
```bash
wget -qO- https://get.nextflow.io | bash
```
You can then include this in your `$PATH` variable if necessary, or invoke the executable directly. For more details on installing Nextflow, see the [documentation](https://www.nextflow.io/docs/latest/getstarted.html).

## Deploying local OLS installation
During the preparation of 2019_04 release, which had to be synchronized with EFO v3, OLS had to be deployed locally because the production deployment of OLS on www.ebi.ac.uk/ols only supported EFO v2 at the time. This can be done using the following command (substitute the image version as appropriate):

Expand Down
14 changes: 10 additions & 4 deletions docs/environment.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Setting up the common environment

1. Log in to the LSF cluster, where all data processing must take place.
1. Log in to the LSF cluster (currently `codon`), where all data processing must take place.
1. Using a `become` command, switch to a common EVA production user instead of your personal account.
1. Adjust and execute the commands below. They will set up the environment, fetch and build the code. Notes:
- The first five variables are installation-specific and are blanked in this repository. You can get the values for the EVA installation from the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets-configuration.md).
- The first six variables are installation-specific and are blanked in this repository. You can get the values for the EVA installation from the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets-configuration.md).
- By modifying the `GIT_REMOTE` and `GIT_BRANCH` variables, you can run an arbitrary version of the pipeline. This can be used for development and debugging. By default it will fetch the master branch from the main pipeline repository.
- Running these commands will overwrite any local changes you had in the repository copy on the cluster.

Expand All @@ -17,6 +17,9 @@ export PYTHON_INSTALL_PATH=
# Location of bcftools installation path
export BCFTOOLS_INSTALL_PATH=

# Location of Nextflow installation path
export NEXTFLOW_INSTALL_PATH=

# The directory where subdirectories for each batch will be created
export BATCH_ROOT_BASE=

Expand All @@ -26,8 +29,8 @@ export FTP_PATH_BASE=
# Base bsub command line for all commands.
export BSUB_CMDLINE="bsub"

# Setting up Python paths
export PATH=${PYTHON_INSTALL_PATH}:${BCFTOOLS_INSTALL_PATH}:$PATH
# Setting up paths
export PATH=${PYTHON_INSTALL_PATH}:${BCFTOOLS_INSTALL_PATH}:${NEXTFLOW_INSTALL_PATH}:$PATH
export PYTHONPATH=${PYTHON_INSTALL_PATH}

# External service paths
Expand All @@ -44,4 +47,7 @@ source env/bin/activate
python3 -m pip -q install --upgrade pip setuptools
python3 -m pip -q install -r requirements.txt
python3 setup.py install

# Location of Python executable, pointing to the virtualenv
export PYTHON_BIN=${CODE_ROOT}/env/bin/python
```
61 changes: 10 additions & 51 deletions docs/generate-evidence-strings.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Next, set up the protocol-specific environment:
export OT_RELEASE=YYYY-MM

# Open Targets JSON schema version.
export OT_SCHEMA_VERSION=2.2.6
export OT_SCHEMA_VERSION=2.2.8
```

## 1. Process data
Expand All @@ -31,58 +31,15 @@ mkdir -p ${BATCH_ROOT}
cd ${BATCH_ROOT}
mkdir -p clinvar gene_mapping evidence_strings logs

# Download ClinVar data. We always use the most recent XML dump, which contains all data for the release.
wget \
-O ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz

# Download the Open Targets JSON schema.
wget \
-O ${BATCH_ROOT}/evidence_strings/opentargets-${OT_SCHEMA_VERSION}.json \
https://raw.githubusercontent.com/opentargets/json_schema/${OT_SCHEMA_VERSION}/opentargets.json

# Run ClinVar variants through VEP and map them to genes and functional consequences.
${BSUB_CMDLINE} -K -M 10G \
-o ${BATCH_ROOT}/logs/consequence_vep.out \
-e ${BATCH_ROOT}/logs/consequence_vep.err \
bash ${CODE_ROOT}/consequence_prediction/run_consequence_mapping.sh \
${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \
${BATCH_ROOT}/gene_mapping/consequences_vep.tsv

# Generate the evidence strings for submission to Open Targets.
${BSUB_CMDLINE} -K -M 10G \
-o ${BATCH_ROOT}/logs/evidence_string_generation.out \
-e ${BATCH_ROOT}/logs/evidence_string_generation.err \
python3 ${CODE_ROOT}/bin/evidence_string_generation.py \
--clinvar-xml ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \
--efo-mapping ${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv \
--gene-mapping ${BATCH_ROOT}/gene_mapping/consequences_vep.tsv \
--ot-schema ${BATCH_ROOT}/evidence_strings/opentargets-${OT_SCHEMA_VERSION}.json \
--out ${BATCH_ROOT}/evidence_strings/ \
--include-structural

# Check that the generated evidence strings do not contain any duplicated evidence strings.
# For every evidence string, we group the value of fields datatypeId, studyId,
# targetFromSourceId, variantId, variantFunctionalConsequenceId and diseaseFromSourceMappedId,
# all separated by tabs, sorted and saved at duplicates.tsv if found duplicated.
jq --arg sep $'\t' -jr \
'.datatypeId,$sep,.studyId,$sep,.targetFromSourceId,$sep,.variantId,$sep,.variantFunctionalConsequenceId,$sep,.diseaseFromSourceMappedId,$sep,.diseaseFromSource,"\n"' \
${BATCH_ROOT}/evidence_strings/evidence_strings.json \
| sort | uniq -d > ${BATCH_ROOT}/evidence_strings/duplicates.tsv

# Convert MedGen and OMIM cross-references into ZOOMA format.
${BSUB_CMDLINE} -K \
-o ${BATCH_ROOT}/logs/traits_to_zooma_format.out \
-e ${BATCH_ROOT}/logs/traits_to_zooma_format.err \
python3 ${CODE_ROOT}/bin/traits_to_zooma_format.py \
--clinvar-xml ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \
--zooma-feedback ${BATCH_ROOT}/clinvar/clinvar_xrefs.txt
# Run the Nextflow pipeline, resuming execution of a previous attempt if possible.
nextflow run ${CODE_ROOT}/eva_cttv_pipeline/evidence_string_generation/pipeline.nf \
--batch_root ${BATCH_ROOT} \
--schema ${OT_SCHEMA_VERSION} \
-resume
```

## 2. Manual follow-up actions

### Check that generated evidence strings do not contain any duplicates
The algorithm used for generating the evidence strings should not allow any duplicate values to be emitted, and the file `${BATCH_ROOT}/evidence_strings/duplicates.tsv` should be empty. Check that this is the case.
### Note on duplication checks
The algorithm used for generating the evidence strings should not allow any duplicate values to be emitted, and the automated pipeline should fail with an error if duplicates are detected.

A repeated evidence string will have identical values for these five fields:
* **datatypeId** - Identifier of the type of data we are associating, varying between somatic and non-somatic ClinVar records (*e.g.* ``somatic_mutation`` or ``genetic_association`` respectively).
Expand All @@ -94,6 +51,8 @@ A repeated evidence string will have identical values for these five fields:

Nevertheless, we also report evidence strings in which ``diseaseFromSourceMappedId`` may be empty (``diseaseFromSourceMappedId: null``) - i.e. the phenotype has not been mapped to an ontology yet. Therefore, to check for duplicates we also take into account the field ``diseaseFromSource``, which is the string describing the phenotype within ClinVar records (and is never missing in any evidence string).

## 2. Manual follow-up actions

### Update summary metrics
After the evidence strings have been generated, summary metrics need to be updated in the Google Sheets [table](https://docs.google.com/spreadsheets/d/1g_4tHNWP4VIikH7Jb0ui5aNr0PiFgvscZYOe69g191k/) on the “Raw statistics” sheet.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

import jsonschema

from consequence_prediction.repeat_expansion_variants import pipeline as repeat_pipeline
from consequence_prediction.structural_variants import pipeline as structural_pipeline
from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
from eva_cttv_pipeline.evidence_string_generation import consequence_type as CT

Expand Down Expand Up @@ -113,29 +111,20 @@ def validate_evidence_string(ev_string, ot_schema_contents):
sys.exit(1)


def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out,
include_structural=False):
def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out):
os.makedirs(dir_out, exist_ok=True)
string_to_efo_mappings = load_efo_mapping(efo_mapping_file)

repeat_consequences = repeat_pipeline.main(clinvar_xml_file)
if include_structural:
structural_consequences = structural_pipeline.main(clinvar_xml_file)
complex_consequences = CT.process_consequence_type_dataframes(repeat_consequences, structural_consequences)
else:
complex_consequences = CT.process_consequence_type_dataframes(repeat_consequences)
variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file, complex_consequences)
variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)

report = clinvar_to_evidence_strings(
string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file,
output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME),
include_structural=include_structural)
output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME))
print(report.collate_report())
report.write_unmapped_terms(dir_out)


def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema,
output_evidence_strings, include_structural):
output_evidence_strings):
report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings)
ot_schema_contents = json.loads(open(ot_schema).read())
output_evidence_strings_file = open(output_evidence_strings, 'wt')
Expand All @@ -160,7 +149,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele
# origins, (2) EFO mappings, and (3) genes where the variant has effect.
grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings, include_structural)
consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)

Expand Down Expand Up @@ -192,8 +181,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
grouped_allele_origins, grouped_diseases, consequence_types):
disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes
evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id,
disease_mapped_efo_id, consequence_attributes,
include_structural=include_structural)
disease_mapped_efo_id, consequence_attributes)

# Validate and immediately output the evidence string (not keeping everything in memory).
validate_evidence_string(evidence_string, ot_schema_contents)
Expand All @@ -219,7 +207,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings


def generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id, disease_mapped_efo_id,
consequence_attributes, include_structural=False):
consequence_attributes):
"""Generates an evidence string based on ClinVar record and some additional attributes."""
is_somatic = allele_origins == ['somatic']
evidence_string = {
Expand Down Expand Up @@ -268,15 +256,15 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea
# required by the Open Targets JSON schema.
'diseaseFromSourceMappedId': disease_mapped_efo_id.split('/')[-1] if disease_mapped_efo_id else None,
}
if include_structural and clinvar_record.measure.preferred_current_hgvs:
if clinvar_record.measure.preferred_current_hgvs:
evidence_string['variantHgvsId'] = clinvar_record.measure.preferred_current_hgvs.text

# Remove the attributes with empty values (either None or empty lists).
evidence_string = {key: value for key, value in evidence_string.items() if value}
return evidence_string


def get_consequence_types(clinvar_record_measure, consequence_type_dict, include_structural=False):
def get_consequence_types(clinvar_record_measure, consequence_type_dict):
"""Returns the list of functional consequences for a given ClinVar record measure.
This is the place where ClinVar records are paired with the information about gene and functional consequences.
Expand Down Expand Up @@ -317,7 +305,7 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict, include
return consequence_type_dict[coord_id]

# If there's also no complete coordinates, pair using HGVS
if include_structural and clinvar_record_measure.preferred_current_hgvs:
if clinvar_record_measure.preferred_current_hgvs:
hgvs_id = clinvar_record_measure.preferred_current_hgvs.text
if hgvs_id in consequence_type_dict:
consequences = consequence_type_dict[hgvs_id]
Expand Down
18 changes: 0 additions & 18 deletions eva_cttv_pipeline/evidence_string_generation/consequence_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,6 @@ def process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term):
consequence_type_dict[variant_id].append(ConsequenceType(ensembl_gene_id, SoTerm(so_term)))


def process_consequence_type_dataframes(*dataframes):
    """
    Build a dictionary of consequence information extracted from one or more dataframes.
    All dataframes are assumed to share the same column layout.
    """
    consequence_type_dict = defaultdict(list)
    for df in dataframes:
        # Positional columns: 1 = variant ID, 2 = Ensembl gene ID, 4 = SO term
        # (column 0 is the index added by itertuples, column 3 is unused here).
        for row in df.itertuples():
            process_gene(consequence_type_dict, row[1], row[2], row[4])
    return consequence_type_dict


def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None):
"""
Return a dictionary of consequence information extracted from the given file.
Expand Down Expand Up @@ -88,7 +71,6 @@ class SoTerm(object):
Represents a sequence ontology term belonging to a consequence type object.
Holds information on accession and rank.
"""

so_accession_name_dict = get_so_accession_dict()

ranked_so_names_list = get_severity_ranking()
Expand Down
Loading

0 comments on commit 0fd4fab

Please sign in to comment.