Skip to content

Commit

Permalink
Merge pull request #337 from apriltuesday/issue-333
Browse files Browse the repository at this point in the history
Convert evidence string generation to Nextflow
  • Loading branch information
apriltuesday authored Sep 5, 2022
2 parents 791cf5d + a5ec644 commit 0fd4fab
Show file tree
Hide file tree
Showing 17 changed files with 343 additions and 119 deletions.
4 changes: 1 addition & 3 deletions bin/evidence_string_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,10 @@
parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True)
parser.add_argument('--out', help='Output directory', required=True)
parser.add_argument('--include-structural', help='Use structural variants consequence prediction pipeline',
action='store_true', default=False, required=False)


if __name__ == '__main__':
args = parser.parse_args()
clinvar_to_evidence_strings.launch_pipeline(
clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
ot_schema_file=args.ot_schema, dir_out=args.out, include_structural=args.include_structural)
ot_schema_file=args.ot_schema, dir_out=args.out)
2 changes: 1 addition & 1 deletion consequence_prediction/run_repeat_expansion_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.'
)
parser.add_argument(
'--output-dataframe', required=True,
'--output-dataframe', required=False,
help='File to output full dataframe for subsequent analysis and debugging.'
)
args = parser.parse_args()
Expand Down
18 changes: 18 additions & 0 deletions consequence_prediction/run_structural_variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env python3
"""A wrapper script for running the structural variants pipeline."""

import argparse
import structural_variants.pipeline

# The docstring above doubles as the --help description for this script.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    '--clinvar-xml', required=True,
    help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)'
)
parser.add_argument(
    '--output-consequences', required=True,
    help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.'
)

args = parser.parse_args()
# Delegate all actual work to the structural variants consequence prediction pipeline.
structural_variants.pipeline.main(args.clinvar_xml, args.output_consequences)
13 changes: 12 additions & 1 deletion consequence_prediction/structural_variants/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,16 @@ def get_vep_results(clinvar_xml):
return vep_results


def main(clinvar_xml):
def generate_consequences_file(consequences, output_consequences):
    """Write the final consequences table as a headerless, tab-separated file.

    The output is consumed by the main evidence string generation pipeline.
    Nothing is written when the table has no rows.
    """
    if not consequences.empty:
        consequences.to_csv(output_consequences, sep='\t', index=False, header=False)
    else:
        logger.info('There are no records ready for output')


def main(clinvar_xml, output_consequences=None):
vep_results = get_vep_results(clinvar_xml)
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'})
variant_data = []
Expand All @@ -82,4 +91,6 @@ def main(clinvar_xml):
# Return as a dataframe to be compatible with repeat expansion pipeline
consequences = pd.DataFrame(variant_data, columns=('VariantID', 'EnsemblGeneID',
'EnsemblGeneName', 'ConsequenceTerm'))
if output_consequences is not None:
generate_consequences_file(consequences, output_consequences)
return consequences
8 changes: 8 additions & 0 deletions docs/build.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ export PYTHONPATH=${INSTALL_PATH}

The installed Python version can then be called with either `python` or `python3`. You can also use either `pip` or `pip3` to install packages into this local distribution.

## Nextflow installation

The evidence string generation pipeline uses Nextflow, which itself relies on Java. You can install it in the current directory as follows:
```bash
wget -qO- https://get.nextflow.io | bash
```
You can then include this in your `$PATH` variable if necessary, or invoke the executable directly. For more details on installing Nextflow, see the [documentation](https://www.nextflow.io/docs/latest/getstarted.html).

## Deploying local OLS installation
During the preparation of 2019_04 release, which had to be synchronized with EFO v3, OLS had to be deployed locally because the production deployment of OLS on www.ebi.ac.uk/ols only supported EFO v2 at the time. This can be done using the following command (substitute the image version as appropriate):

Expand Down
14 changes: 10 additions & 4 deletions docs/environment.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Setting up the common environment

1. Log in to the LSF cluster, where all data processing must take place.
1. Log in to the LSF cluster (currently `codon`), where all data processing must take place.
1. Using a `become` command, switch to a common EVA production user instead of your personal account.
1. Adjust and execute the commands below. They will set up the environment, fetch and build the code. Notes:
- The first five variables are installation-specific and are blanked in this repository. You can get the values for the EVA installation from the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets-configuration.md).
- The first six variables are installation-specific and are blanked in this repository. You can get the values for the EVA installation from the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets-configuration.md).
- By modifying the `GIT_REMOTE` and `GIT_BRANCH` variables, you can run an arbitrary version of the pipeline. This can be used for development and debugging. By default it will fetch the master branch from the main pipeline repository.
- Running these commands will overwrite any local changes you had in the repository copy on the cluster.

Expand All @@ -17,6 +17,9 @@ export PYTHON_INSTALL_PATH=
# Location of bcftools installation path
export BCFTOOLS_INSTALL_PATH=

# Location of Nextflow installation path
export NEXTFLOW_INSTALL_PATH=

# The directory where subdirectories for each batch will be created
export BATCH_ROOT_BASE=

Expand All @@ -26,8 +29,8 @@ export FTP_PATH_BASE=
# Base bsub command line for all commands.
export BSUB_CMDLINE="bsub"

# Setting up Python paths
export PATH=${PYTHON_INSTALL_PATH}:${BCFTOOLS_INSTALL_PATH}:$PATH
# Setting up paths
export PATH=${PYTHON_INSTALL_PATH}:${BCFTOOLS_INSTALL_PATH}:${NEXTFLOW_INSTALL_PATH}:$PATH
export PYTHONPATH=${PYTHON_INSTALL_PATH}

# External service paths
Expand All @@ -44,4 +47,7 @@ source env/bin/activate
python3 -m pip -q install --upgrade pip setuptools
python3 -m pip -q install -r requirements.txt
python3 setup.py install

# Location of Python executable, pointing to the virtualenv
export PYTHON_BIN=${CODE_ROOT}/env/bin/python
```
61 changes: 10 additions & 51 deletions docs/generate-evidence-strings.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Next, set up the protocol-specific environment:
export OT_RELEASE=YYYY-MM

# Open Targets JSON schema version.
export OT_SCHEMA_VERSION=2.2.6
export OT_SCHEMA_VERSION=2.2.8
```

## 1. Process data
Expand All @@ -31,58 +31,15 @@ mkdir -p ${BATCH_ROOT}
cd ${BATCH_ROOT}
mkdir -p clinvar gene_mapping evidence_strings logs

# Download ClinVar data. We always use the most recent XML dump, which contains all data for the release.
wget \
-O ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz

# Download the Open Targets JSON schema.
wget \
-O ${BATCH_ROOT}/evidence_strings/opentargets-${OT_SCHEMA_VERSION}.json \
https://raw.githubusercontent.com/opentargets/json_schema/${OT_SCHEMA_VERSION}/opentargets.json

# Run ClinVar variants through VEP and map them to genes and functional consequences.
${BSUB_CMDLINE} -K -M 10G \
-o ${BATCH_ROOT}/logs/consequence_vep.out \
-e ${BATCH_ROOT}/logs/consequence_vep.err \
bash ${CODE_ROOT}/consequence_prediction/run_consequence_mapping.sh \
${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \
${BATCH_ROOT}/gene_mapping/consequences_vep.tsv

# Generate the evidence strings for submission to Open Targets.
${BSUB_CMDLINE} -K -M 10G \
-o ${BATCH_ROOT}/logs/evidence_string_generation.out \
-e ${BATCH_ROOT}/logs/evidence_string_generation.err \
python3 ${CODE_ROOT}/bin/evidence_string_generation.py \
--clinvar-xml ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \
--efo-mapping ${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv \
--gene-mapping ${BATCH_ROOT}/gene_mapping/consequences_vep.tsv \
--ot-schema ${BATCH_ROOT}/evidence_strings/opentargets-${OT_SCHEMA_VERSION}.json \
--out ${BATCH_ROOT}/evidence_strings/ \
--include-structural

# Check that the generated evidence strings do not contain any duplicated evidence strings.
# For every evidence string, we group the value of fields datatypeId, studyId,
# targetFromSourceId, variantId, variantFunctionalConsequenceId and diseaseFromSourceMappedId,
# all separated by tabs, sorted and saved at duplicates.tsv if found duplicated.
jq --arg sep $'\t' -jr \
'.datatypeId,$sep,.studyId,$sep,.targetFromSourceId,$sep,.variantId,$sep,.variantFunctionalConsequenceId,$sep,.diseaseFromSourceMappedId,$sep,.diseaseFromSource,"\n"' \
${BATCH_ROOT}/evidence_strings/evidence_strings.json \
| sort | uniq -d > ${BATCH_ROOT}/evidence_strings/duplicates.tsv

# Convert MedGen and OMIM cross-references into ZOOMA format.
${BSUB_CMDLINE} -K \
-o ${BATCH_ROOT}/logs/traits_to_zooma_format.out \
-e ${BATCH_ROOT}/logs/traits_to_zooma_format.err \
python3 ${CODE_ROOT}/bin/traits_to_zooma_format.py \
--clinvar-xml ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \
--zooma-feedback ${BATCH_ROOT}/clinvar/clinvar_xrefs.txt
# Run the Nextflow pipeline, resuming execution of a previous attempt if possible.
nextflow run ${CODE_ROOT}/eva_cttv_pipeline/evidence_string_generation/pipeline.nf \
--batch_root ${BATCH_ROOT} \
--schema ${OT_SCHEMA_VERSION} \
-resume
```

## 2. Manual follow-up actions

### Check that generated evidence strings do not contain any duplicates
The algorithm used for generating the evidence strings should not allow any duplicate values to be emitted, and the file `${BATCH_ROOT}/evidence_strings/duplicates.tsv` should be empty. Check that this is the case.
### Note on duplication checks
The algorithm used for generating the evidence strings should not allow any duplicate values to be emitted, and the automated pipeline should fail with an error if duplicates are detected.

A repeated evidence string will have identical values for these five fields:
* **datatypeId** - Identifier of the type of data we are associating, varying between somatic and non-somatic ClinVar records (*e.g.* ``somatic_mutation`` or ``genetic_association`` respectively).
Expand All @@ -94,6 +51,8 @@ A repeated evidence string will have identical values for these five fields:

Nevertheless, we also report evidence strings in which ``diseaseFromSourceMappedId`` may be empty (``diseaseFromSourceMappedId: null``) - i.e. the phenotype has not been mapped to an ontology yet. Therefore, to check for duplicates we also take into account the field ``diseaseFromSource``, which is the string describing the phenotype within ClinVar records (and is never missing in any evidence string).

## 2. Manual follow-up actions

### Update summary metrics
After the evidence strings have been generated, summary metrics need to be updated in the Google Sheets [table](https://docs.google.com/spreadsheets/d/1g_4tHNWP4VIikH7Jb0ui5aNr0PiFgvscZYOe69g191k/) on the “Raw statistics” sheet.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

import jsonschema

from consequence_prediction.repeat_expansion_variants import pipeline as repeat_pipeline
from consequence_prediction.structural_variants import pipeline as structural_pipeline
from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
from eva_cttv_pipeline.evidence_string_generation import consequence_type as CT

Expand Down Expand Up @@ -113,29 +111,20 @@ def validate_evidence_string(ev_string, ot_schema_contents):
sys.exit(1)


def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out,
include_structural=False):
def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out):
os.makedirs(dir_out, exist_ok=True)
string_to_efo_mappings = load_efo_mapping(efo_mapping_file)

repeat_consequences = repeat_pipeline.main(clinvar_xml_file)
if include_structural:
structural_consequences = structural_pipeline.main(clinvar_xml_file)
complex_consequences = CT.process_consequence_type_dataframes(repeat_consequences, structural_consequences)
else:
complex_consequences = CT.process_consequence_type_dataframes(repeat_consequences)
variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file, complex_consequences)
variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)

report = clinvar_to_evidence_strings(
string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file,
output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME),
include_structural=include_structural)
output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME))
print(report.collate_report())
report.write_unmapped_terms(dir_out)


def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema,
output_evidence_strings, include_structural):
output_evidence_strings):
report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings)
ot_schema_contents = json.loads(open(ot_schema).read())
output_evidence_strings_file = open(output_evidence_strings, 'wt')
Expand All @@ -160,7 +149,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele
# origins, (2) EFO mappings, and (3) genes where the variant has effect.
grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings, include_structural)
consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)

Expand Down Expand Up @@ -192,8 +181,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
grouped_allele_origins, grouped_diseases, consequence_types):
disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes
evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id,
disease_mapped_efo_id, consequence_attributes,
include_structural=include_structural)
disease_mapped_efo_id, consequence_attributes)

# Validate and immediately output the evidence string (not keeping everything in memory).
validate_evidence_string(evidence_string, ot_schema_contents)
Expand All @@ -219,7 +207,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings


def generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id, disease_mapped_efo_id,
consequence_attributes, include_structural=False):
consequence_attributes):
"""Generates an evidence string based on ClinVar record and some additional attributes."""
is_somatic = allele_origins == ['somatic']
evidence_string = {
Expand Down Expand Up @@ -268,15 +256,15 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea
# required by the Open Targets JSON schema.
'diseaseFromSourceMappedId': disease_mapped_efo_id.split('/')[-1] if disease_mapped_efo_id else None,
}
if include_structural and clinvar_record.measure.preferred_current_hgvs:
if clinvar_record.measure.preferred_current_hgvs:
evidence_string['variantHgvsId'] = clinvar_record.measure.preferred_current_hgvs.text

# Remove the attributes with empty values (either None or empty lists).
evidence_string = {key: value for key, value in evidence_string.items() if value}
return evidence_string


def get_consequence_types(clinvar_record_measure, consequence_type_dict, include_structural=False):
def get_consequence_types(clinvar_record_measure, consequence_type_dict):
"""Returns the list of functional consequences for a given ClinVar record measure.
This is the place where ClinVar records are paired with the information about gene and functional consequences.
Expand Down Expand Up @@ -317,7 +305,7 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict, include
return consequence_type_dict[coord_id]

# If there's also no complete coordinates, pair using HGVS
if include_structural and clinvar_record_measure.preferred_current_hgvs:
if clinvar_record_measure.preferred_current_hgvs:
hgvs_id = clinvar_record_measure.preferred_current_hgvs.text
if hgvs_id in consequence_type_dict:
consequences = consequence_type_dict[hgvs_id]
Expand Down
18 changes: 0 additions & 18 deletions eva_cttv_pipeline/evidence_string_generation/consequence_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,6 @@ def process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term):
consequence_type_dict[variant_id].append(ConsequenceType(ensembl_gene_id, SoTerm(so_term)))


def process_consequence_type_dataframes(*dataframes):
    """
    Build a dictionary of consequence information extracted from one or more dataframes.
    All dataframes are assumed to share the same column layout.
    """
    consequence_type_dict = defaultdict(list)
    for df in dataframes:
        # Positional columns: 1 = variant ID, 2 = Ensembl gene ID, 4 = SO term
        # (column 0 is the index added by itertuples, column 3 is unused here).
        for row in df.itertuples():
            process_gene(consequence_type_dict, row[1], row[2], row[4])
    return consequence_type_dict


def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None):
"""
Return a dictionary of consequence information extracted from the given file.
Expand Down Expand Up @@ -88,7 +71,6 @@ class SoTerm(object):
Represents a sequence ontology term belonging to a consequence type object.
Holds information on accession and rank.
"""

so_accession_name_dict = get_so_accession_dict()

ranked_so_names_list = get_severity_ranking()
Expand Down
Loading

0 comments on commit 0fd4fab

Please sign in to comment.