diff --git a/bin/evaluation/map_genes.py b/bin/evaluation/map_genes.py
new file mode 100644
index 00000000..fcd6342c
--- /dev/null
+++ b/bin/evaluation/map_genes.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import argparse
+
+import pandas as pd
+
+from consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info
+from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
+
+
+def load_clinvar_data(clinvar_xml):
+ """Load ClinVar data, process for gene symbols and HGNC IDs, and return it as a Pandas dataframe.
+ Modified from similar functionality in the repeat expansion pipeline."""
+ variant_data = [] # To populate the return dataframe (see columns below)
+ for clinvar_record in clinvar_xml_io.ClinVarDataset(clinvar_xml):
+ # Skip a record if it does not contain variant information
+ if not clinvar_record.measure:
+ continue
+ measure = clinvar_record.measure
+
+ # Extract gene symbol(s). Here and below, dashes are sometimes assigned to be compatible with the variant
+ # summary format which was used previously.
+ gene_symbols = measure.preferred_gene_symbols
+ if not gene_symbols:
+ gene_symbols = ['-']
+
+ # Extract HGNC ID
+ hgnc_ids = measure.hgnc_ids
+ hgnc_id = hgnc_ids[0] if len(hgnc_ids) == 1 and len(gene_symbols) == 1 else '-'
+
+ # Append data strings (including unused columns for compatibility)
+ for gene_symbol in gene_symbols:
+ variant_data.append([
+ measure.preferred_or_other_name,
+ clinvar_record.accession,
+ gene_symbol,
+ hgnc_id,
+ None,
+ None
+ ])
+
+ variants = pd.DataFrame(variant_data, columns=('Name',
+ 'RCVaccession',
+ 'GeneSymbol',
+ 'HGNC_ID',
+ 'TranscriptID',
+ 'RepeatType'))
+
+ # Since the same record can have coordinates in multiple builds, it can be repeated. Remove duplicates
+ variants = variants.drop_duplicates()
+ return variants
+
+
+def main(clinvar_xml, output_file):
+ """Load ClinVar XML, map to Ensembl gene IDs, and dump results to TSV."""
+ variants = load_clinvar_data(clinvar_xml)
+ annotated_variants = annotate_ensembl_gene_info(variants)
+ annotated_variants[['RCVaccession', 'EnsemblGeneID']].to_csv(output_file, sep='\t', index=False, header=False)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Script to convert HGNC IDs and gene symbols in ClinVar to Ensembl')
+ parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file')
+ parser.add_argument('--output-file', required=True, help='File to output dataframe')
+ args = parser.parse_args()
+ main(args.clinvar_xml, args.output_file)
diff --git a/bin/evaluation/map_xrefs.py b/bin/evaluation/map_xrefs.py
new file mode 100644
index 00000000..55bb8066
--- /dev/null
+++ b/bin/evaluation/map_xrefs.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import multiprocessing
+from functools import lru_cache
+
+from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
+from eva_cttv_pipeline.trait_mapping.ols import build_ols_query
+from eva_cttv_pipeline.trait_mapping.utils import json_request
+
+
+@lru_cache
+def get_synonyms(db, iden):
+ """Find synonyms (replacement terms or exact matches) for this ontology identifier using OLS."""
+ synonyms = set()
+ ontology_uri = OntologyUri(iden, db)
+ url = build_ols_query(str(ontology_uri))
+ json_response = json_request(url)
+ if json_response and '_embedded' in json_response:
+ for term in json_response['_embedded']['terms']:
+ # Get only EFO terms (even if imported)
+ if term['ontology_name'] == 'efo':
+ synonyms.add(term['iri'])
+ # Check whether current term is obsolete
+ if term['is_obsolete'] and term['term_replaced_by']:
+ synonyms.add(term['term_replaced_by'])
+ # Also add exact matches
+ if 'exactMatch' in term['annotation']:
+ synonyms.update(term['annotation']['exactMatch'])
+
+ # Synonyms contains current EFO-included URIs, convert to DB:ID style
+ synonyms = {OntologyUri.uri_to_curie(s) for s in synonyms}
+ # Filter out Nones
+ synonyms = {s for s in synonyms if s is not None}
+
+ if synonyms:
+ return ontology_uri.curie, synonyms
+ # If no synonyms, just return the original identifier as is
+ return ontology_uri.curie, {ontology_uri.curie}
+
+
+def main(clinvar_xml, output_file):
+ """Load ClinVar XML, map trait xrefs identifiers to synonyms in OLS, and dump results to TSV."""
+ traits = set()
+ for record in clinvar_xml_io.ClinVarDataset(clinvar_xml):
+ for trait in record.traits_with_valid_names:
+ traits.update([(db, iden) for db, iden, _ in trait.current_efo_aligned_xrefs])
+
+ traits = list(traits)
+ process_pool = multiprocessing.Pool(processes=24)
+ annotated_traits = [
+ process_pool.apply(get_synonyms, args=(db, iden))
+ for db, iden in traits
+ ]
+ with open(output_file, 'w+') as f:
+ csv.writer(f, delimiter="\t").writerows(annotated_traits)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Script to map trait xrefs in ClinVar to synonyms in OLS')
+ parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file')
+ parser.add_argument('--output-file', required=True, help='File to output dataframe')
+ args = parser.parse_args()
+ main(args.clinvar_xml, args.output_file)
diff --git a/bin/generate_annotated_xml.py b/bin/generate_annotated_xml.py
index 9c7f9c25..83bf3c75 100644
--- a/bin/generate_annotated_xml.py
+++ b/bin/generate_annotated_xml.py
@@ -8,10 +8,12 @@
parser.add_argument('--efo-mapping', help='Disease string to ontology mappings', required=True)
parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
parser.add_argument('--output-xml', help='Output XML file', required=True)
+parser.add_argument('--eval-gene-file', help='Gene mappings for evaluation', required=False)
+parser.add_argument('--eval-xref-file', help='Trait xref mappings for evaluation', required=False)
if __name__ == '__main__':
args = parser.parse_args()
generate_annotated_clinvar_xml(
clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
- output_xml_file=args.output_xml)
+ output_xml_file=args.output_xml, eval_gene_file=args.eval_gene_file, eval_xref_file=args.eval_xref_file)
diff --git a/bin/trait_mapping/create_efo_table.py b/bin/trait_mapping/create_efo_table.py
index 561347c3..3df9dd79 100644
--- a/bin/trait_mapping/create_efo_table.py
+++ b/bin/trait_mapping/create_efo_table.py
@@ -4,6 +4,9 @@
import re
import requests
+from requests import HTTPError
+from retry import retry
+
# Name of ontology in OLS url, e. g. https://www.ebi.ac.uk/ols/ontologies/ordo/terms?iri=...
ontology_to_ols = {
'HP': 'hp',
@@ -49,15 +52,17 @@ def uri_to_curie(uri):
return uri.split('/')[-1].replace('#', '').replace('_', ':')
+@retry(HTTPError, tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def get_cross_references(curie):
"""Queries OxO to return the list of cross-references for a given term curie."""
url = oxo_url_template.format(curie=curie)
- response = requests.get(url).json()
- if '_embedded' not in response:
- print('Warning: OxO error for term {}. No cross-links will be available for this term. '
- 'See https://github.com/EBISPOT/OXO/issues/26'.format(curie))
- return []
- mappings = response['_embedded']['searchResults'][0]['mappingResponseList']
+ response = requests.get(url)
+ response.raise_for_status()
+ json_response = response.json()
+ if '_embedded' not in json_response:
+ raise ValueError('Warning: OxO error for term {}. No cross-links will be available for this term. '
+ 'See https://github.com/EBISPOT/OXO/issues/26'.format(curie))
+ mappings = json_response['_embedded']['searchResults'][0]['mappingResponseList']
return [m['curie'] for m in mappings]
@@ -79,7 +84,12 @@ def get_ols_details(ontology, term):
# Cross-references
term_curie = uri_to_curie(term)
xrefs = {}
- for x in get_cross_references(term_curie):
+ try:
+ oxo_xrefs = get_cross_references(term_curie)
+ except (HTTPError, ValueError) as e:
+ print('Warning: OxO error for term {}. No cross-links will be available for this term.'.format(term_curie))
+ oxo_xrefs = []
+ for x in oxo_xrefs:
xref_ontology, xref_id = re.split('[_:]', x)
xrefs.setdefault(xref_ontology, set()).add('{}:{}'.format(xref_ontology, xref_id))
diff --git a/bin/traits_to_zooma_format.py b/bin/traits_to_zooma_format.py
index ae4dffff..8b94765a 100644
--- a/bin/traits_to_zooma_format.py
+++ b/bin/traits_to_zooma_format.py
@@ -5,28 +5,7 @@
from time import gmtime, strftime
from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
-
-
-class OntologyUri:
- # ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with
- # some examples of how this looks like in ClinVar data.
- db_to_uri_conversion = {
- 'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}', #
- 'omim': lambda x: f'https://www.omim.org/entry/{x}', #
- 'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}', #
- 'mesh': lambda x: f'http://identifiers.org/mesh/{x}', #
- 'medgen': lambda x: f'http://identifiers.org/medgen/{x}', #
- #
- 'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
- }
-
- def __init__(self, id_, db):
- self.id_ = id_
- self.db = db
- self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_)
-
- def __str__(self):
- return self.uri
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
def write_zooma_record(clinvar_acc, variant_id, trait_name, ontology_uri, date, outfile):
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py
index 5cb29a04..1fdca089 100644
--- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py
+++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py
@@ -85,6 +85,16 @@ def nsv_id(self):
logger.warning(f'Found multiple NSV IDs for {self.clinvar_record}, this is not yet supported')
return None
+ @property
+ def existing_so_terms(self):
+ all_so_terms = set()
+ for attr_elem in find_elements(self.measure_xml, './AttributeSet'):
+ if find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]'):
+ for so_elem in find_elements(attr_elem, './XRef[@DB="Sequence Ontology"]'):
+ if 'providedBy' not in so_elem.attrib:
+ all_so_terms.add(so_elem.attrib['ID'])
+ return all_so_terms
+
@cached_property
def _hgvs_to_types(self):
"""
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py
index ed89e36d..60a0229e 100644
--- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py
+++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py
@@ -17,6 +17,9 @@ class ClinVarTrait:
'reclassified - variant of unknown significance', 'see cases', 'variant of unknown significance'
}
+ # These database identifiers are directly importable into EFO
+ EFO_ALIGNED_ONTOLOGIES = {'Human Phenotype Ontology', 'EFO', 'Orphanet', 'MONDO'}
+
def __init__(self, trait_xml, clinvar_record):
self.trait_xml = trait_xml
self.clinvar_record = clinvar_record
@@ -83,3 +86,9 @@ def medgen_id(self):
else:
logger.warning(f'Multiple MedGen IDs for {self}: {medgen_ids}')
return medgen_ids[0]
+
+ @property
+ def current_efo_aligned_xrefs(self):
+ """Returns current EFO or EFO-aligned ids."""
+ return [(db, iden, status) for (db, iden, status) in self.xrefs
+ if status == 'current' and db in self.EFO_ALIGNED_ONTOLOGIES]
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py
new file mode 100644
index 00000000..6b7f162f
--- /dev/null
+++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py
@@ -0,0 +1,66 @@
+import logging
+
+logging.basicConfig()
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class OntologyUri:
+ # ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with
+ # some examples of how this looks like in ClinVar data.
+ db_to_uri_conversion = {
+ 'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}', #
+ 'omim': lambda x: f'https://www.omim.org/entry/{x}', #
+ 'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}', #
+ 'mesh': lambda x: f'http://identifiers.org/mesh/{x}', #
+ 'medgen': lambda x: f'http://identifiers.org/medgen/{x}', #
+ #
+ 'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
+ #
+ 'hp': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
+ }
+
+ def __init__(self, id_, db):
+ self.id_ = id_
+ self.db = db if db.lower() != 'human phenotype ontology' else 'HP'
+ self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_)
+
+ def __str__(self):
+ return self.uri
+
+ @property
+ def curie(self):
+ return self.uri_to_curie(self.uri)
+
+ @staticmethod
+ def uri_to_curie(uri):
+ """Convert an ontology uri to a DB:ID format."""
+ uri_db_to_curie_db = {
+ "ordo": "Orphanet",
+ "orphanet": "Orphanet",
+ "omim": "OMIM",
+ "efo": "EFO",
+ "hp": "HP",
+ "mondo": "MONDO",
+ }
+ if not any(x in uri.lower() for x in uri_db_to_curie_db.keys()):
+ return None
+ uri = uri.rstrip("/")
+ uri_list = uri.split("/")
+ if "identifiers.org" in uri:
+ db = uri_list[-2]
+ id_ = uri_list[-1]
+ elif "omim.org" in uri:
+ db = "OMIM"
+ id_ = uri_list[-1]
+ else:
+ last_component = uri_list[-1]
+ if ":" in last_component:
+ return last_component
+ elif "_" in last_component:
+ db, id_ = last_component.split("_")
+ else:
+ logger.warning(f"Could not convert URI to CURIE: {uri}")
+ return None
+ db = uri_db_to_curie_db[db.lower()]
+ return "{}:{}".format(db, id_)
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py
index 97f3c93c..9ece88fa 100644
--- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py
+++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py
@@ -106,7 +106,7 @@ def parse_all_identifiers(clinvar_measure: ClinVarRecordMeasure):
Repeat type must be present if possible, transcript id is useful if present."""
# TODO should we prioritise these in some way?
all_names = [clinvar_measure.get_variant_name_or_hgvs()] +\
- [hgvs.text for hgvs in clinvar_measure.current_hgvs] +\
+ sorted([hgvs.text for hgvs in clinvar_measure.current_hgvs]) +\
clinvar_measure.all_names
for name in all_names:
repeat_type, transcript_id = infer_repeat_type_and_transcript_id(name)
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py
index a57251a4..3a7ddf3b 100644
--- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py
+++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py
@@ -14,6 +14,7 @@ def parse_header_attributes(clinvar_xml):
if elem.tag == 'ReleaseSet':
attrib = elem.attrib
break
+ elem.clear()
if not attrib:
return {}
# Resolve xsi:noNamespaceSchemaLocation="...", which is parsed strangely by ElementTree
diff --git a/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py b/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py
new file mode 100644
index 00000000..9175fdc1
--- /dev/null
+++ b/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py
@@ -0,0 +1,11 @@
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
+
+
+def test_uri_to_curie():
+ assert OntologyUri.uri_to_curie('http://www.orpha.net/ORDO/Orphanet_713') == 'Orphanet:713'
+ assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/HP_0002930') == 'HP:0002930'
+ assert OntologyUri.uri_to_curie('https://omim.org/entry/300377') == 'OMIM:300377'
+ # If for some reason we're already passed a CURIE (probably a mistake in OLS response), return it as-is
+ assert OntologyUri.uri_to_curie('HP:0002505') == 'HP:0002505'
+ # Not a supported db
+ assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/DOID_10652') == None
diff --git a/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py b/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py
index 5c374250..fd9833aa 100644
--- a/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py
+++ b/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py
@@ -1,4 +1,8 @@
+import re
import xml.etree.ElementTree as ET
+from collections import defaultdict
+
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io import ClinVarTrait, ClinVarRecordMeasure, ClinVarDataset, \
ClinVarRecord
@@ -6,7 +10,7 @@
from eva_cttv_pipeline.evidence_string_generation.clinvar_to_evidence_strings import load_efo_mapping, \
get_consequence_types
from eva_cttv_pipeline.evidence_string_generation import consequence_type as CT
-
+from eva_cttv_pipeline.evidence_string_generation.set_metrics import SetComparisonMetrics
PROCESSOR = 'CMAT'
@@ -15,29 +19,110 @@ class AnnotatingClinVarDataset(ClinVarDataset):
"""This class provides the ability to parse ClinVar records (RCVs) and annotate them with EFO mappings and
consequence mappings on the fly."""
- def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings):
+ def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings,
+ eval_gene_mappings=None, eval_xref_mappings=None):
super().__init__(clinvar_xml)
self.header_attr['ProcessedBy'] = PROCESSOR
self.string_to_efo_mappings = string_to_efo_mappings
self.variant_to_gene_mappings = variant_to_gene_mappings
+ self.eval_gene_mappings = eval_gene_mappings
+ self.eval_xref_mappings = eval_xref_mappings
+ self.overall_counts = {}
+ self.gene_metrics = None
+ self.conseq_metrics = None
+ self.trait_metrics = None
def __iter__(self):
+ # Initialise counts
+ self.overall_counts = {
+ 'total': 0,
+ 'has_supported_measure': 0,
+ 'has_supported_trait': 0,
+ }
+ self.gene_metrics = SetComparisonMetrics()
+ self.conseq_metrics = SetComparisonMetrics()
+ self.trait_metrics = SetComparisonMetrics()
+
for rcv in iterate_rcv_from_xml(self.clinvar_xml):
record = AnnotatedClinVarRecord(rcv)
self.annotate(record)
yield record
+ # Finalise counts - computes averages, etc.
+ self.gene_metrics.finalise()
+ self.conseq_metrics.finalise()
+ self.trait_metrics.finalise()
+
def annotate(self, record):
+ self.overall_counts['total'] += 1
# Functional consequences for measure
if record.measure:
- consequence_types = get_consequence_types(record.measure, self.variant_to_gene_mappings)
- record.measure.add_ensembl_annotations(consequence_types)
- # EFO terms for trait
+ self.overall_counts['has_supported_measure'] += 1
+ self.annotate_and_count_measure(record)
+ # EFO terms for traits
+ if record.traits_with_valid_names:
+ self.overall_counts['has_supported_trait'] += 1
+ self.annotate_and_count_traits(record)
+
+ def annotate_and_count_measure(self, record):
+ consequence_types = get_consequence_types(record.measure, self.variant_to_gene_mappings)
+ record.measure.add_ensembl_annotations(consequence_types)
+
+ annotated_genes = {ct.ensembl_gene_id for ct in consequence_types if ct.ensembl_gene_id}
+ annotated_conseqs = {EnsemblAnnotatedClinVarMeasure.format_so_term(ct.so_term)
+ for ct in consequence_types if ct.so_term}
+
+ if self.eval_gene_mappings:
+ existing_ensembl_ids = self.eval_gene_mappings.get(record.accession, [])
+ self.gene_metrics.count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes)
+ self.conseq_metrics.count_and_score(cv_set=record.measure.existing_so_terms, cmat_set=annotated_conseqs)
+
+ def annotate_and_count_traits(self, record):
for trait in record.traits_with_valid_names:
+ existing_efo_ids = set()
+ annotated_efo_ids = set()
efo_ids = []
for trait_name in trait.all_names:
- efo_ids.extend(efo_id for efo_id, efo_label in self.string_to_efo_mappings.get(trait_name.lower(), []))
+ efo_ids.extend(
+ EfoMappedClinVarTrait.format_efo_id(efo_id)
+ for efo_id, efo_label in self.string_to_efo_mappings.get(trait_name.lower(), []))
+
+ for db, iden, _ in trait.current_efo_aligned_xrefs:
+ curie = OntologyUri(iden, db).curie
+ if curie:
+ existing_efo_ids.add(curie)
trait.add_efo_mappings(efo_ids)
+ if self.eval_xref_mappings:
+ for efo_id in efo_ids:
+ # Attempt to match to an ID in ClinVar based on synonyms - if our ID is in the list of synonyms for
+ # a ClinVar ID, we use the synonymous ClinVar ID for comparison.
+ for cv_id in existing_efo_ids:
+ if efo_id in self.eval_xref_mappings.get(cv_id, {}):
+ annotated_efo_ids.add(cv_id)
+ # If didn't find anything, just use our ID, which will count as not matching.
+ if not annotated_efo_ids:
+ annotated_efo_ids.add(efo_id)
+
+ self.trait_metrics.count_and_score(cv_set=existing_efo_ids, cmat_set=annotated_efo_ids)
+
+ def report(self):
+ print('\nOverall counts (RCVs):')
+ self.print_counter(self.overall_counts)
+ if self.eval_gene_mappings:
+ print('\nGene annotations:')
+ self.gene_metrics.report()
+ print('\nFunctional consequences:')
+ self.conseq_metrics.report()
+ if self.eval_xref_mappings:
+ print('\nTrait mappings:')
+ self.trait_metrics.report()
+ print()
+
+ @staticmethod
+ def print_counter(counter):
+ max_len = len(max(counter.keys(), key=lambda x: len(x)))
+ for k, v in counter.items():
+ print(f'{k: <{max_len}} {v}')
class AnnotatedClinVarRecord(ClinVarRecord):
@@ -51,11 +136,18 @@ class EfoMappedClinVarTrait(ClinVarTrait):
def add_efo_mappings(self, efo_ids):
efo_elts = []
for efo_id in efo_ids:
- if efo_id.startswith('http'):
- efo_id = efo_id.split('/')[-1].replace('_', ':')
- efo_elts.append(ET.Element('XRef', attrib={'ID': efo_id, 'DB': 'EFO', 'providedBy': PROCESSOR}))
+ efo_id = self.format_efo_id(efo_id)
+ # Include Status attribute so this isn't included among current xrefs
+ efo_elts.append(ET.Element('XRef', attrib={
+ 'ID': efo_id, 'DB': 'EFO', 'Status': 'annotated', 'providedBy': PROCESSOR}))
self.trait_xml.extend(efo_elts)
+ @staticmethod
+ def format_efo_id(efo_id):
+ if efo_id.startswith('http'):
+ return efo_id.split('/')[-1].replace('_', ':')
+ return efo_id
+
class EnsemblAnnotatedClinVarMeasure(ClinVarRecordMeasure):
@@ -65,19 +157,54 @@ def add_ensembl_annotations(self, consequences):
attr_set_elt = ET.Element('AttributeSet')
attribute_elt = ET.Element('Attribute', attrib={'Type': 'MolecularConsequence', 'providedBy': PROCESSOR})
attribute_elt.text = consequence_attributes.so_term.so_name.replace('_', ' ')
- so_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.so_term.accession.replace('_', ':'),
+ so_elt = ET.Element('XRef', attrib={'ID': self.format_so_term(consequence_attributes.so_term),
'DB': 'Sequence Ontology'})
ensembl_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.ensembl_gene_id, 'DB': 'Ensembl'})
attr_set_elt.extend((attribute_elt, so_elt, ensembl_elt))
consequence_elts.append(attr_set_elt)
self.measure_xml.extend(consequence_elts)
+ @staticmethod
+ def format_so_term(so_term):
+ return so_term.accession.replace('_', ':')
+
-def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file):
+def load_evaluation_gene_mappings(input_path):
+ mapping = defaultdict(list)
+ with open(input_path) as input_file:
+ for line in input_file:
+ cols = line.strip().split('\t')
+ if len(cols) != 2:
+ continue
+ mapping[cols[0]].append(cols[1])
+ return mapping
+
+
+def load_evaluation_xref_mappings(input_path):
+ mapping = {}
+ with open(input_path) as input_file:
+ for line in input_file:
+ cols = line.strip().split('\t')
+ if len(cols) != 2:
+ continue
+ mapping[cols[0]] = re.sub(r"{|}|'", '', cols[1]).split(', ')
+ return mapping
+
+
+def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file,
+ eval_gene_file=None, eval_xref_file=None):
"""Generate an annotated XML file of ClinVar RCVs based on EFO mappings file and gene mapping file (as documented in
clinvar_to_evidence_strings)."""
string_to_efo_mappings = load_efo_mapping(efo_mapping_file)
variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)
-
- dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings)
+ # Need both files to do an evaluation
+ if eval_gene_file and eval_xref_file:
+ eval_gene_mappings = load_evaluation_gene_mappings(eval_gene_file)
+ eval_xref_mappings = load_evaluation_xref_mappings(eval_xref_file)
+ dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings,
+ eval_gene_mappings=eval_gene_mappings,
+ eval_xref_mappings=eval_xref_mappings)
+ else:
+ dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings)
dataset.write(output_xml_file)
+ dataset.report()
diff --git a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf
index 3f23d07d..f104683e 100644
--- a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf
+++ b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf
@@ -11,6 +11,8 @@ def helpMessage() {
--output_dir Directory for output
--schema Open Targets JSON schema version (optional, will output XML if omitted)
--clinvar ClinVar XML file (optional, will download latest if omitted)
+ --mappings Trait mappings file (optional, will use a default path if omitted)
+ --evaluate Whether to run evaluation or not (default false)
"""
}
@@ -18,6 +20,8 @@ params.help = null
params.output_dir = null
params.schema = null
params.clinvar = null
+params.mappings = '${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv'
+params.evaluate = false
if (params.help) {
exit 0, helpMessage()
@@ -58,7 +62,14 @@ workflow {
} else {
// Annotated ClinVar XML output
- generateAnnotatedXml(clinvarXml, combineConsequences.out.consequencesCombined)
+ if (params.evaluate) {
+ evalGeneMapping = mapGenes(clinvarXml)
+ evalXrefMapping = mapXrefs(clinvarXml)
+ } else {
+ evalGeneMapping = null
+ evalXrefMapping = null
+ }
+ generateAnnotatedXml(clinvarXml, combineConsequences.out.consequencesCombined, evalGeneMapping, evalXrefMapping)
}
}
@@ -201,6 +212,42 @@ process combineConsequences {
"""
}
+/*
+ * Map existing genes to Ensembl gene IDs. Currently used only for evaluation.
+ */
+process mapGenes {
+ input:
+ path clinvarXml
+
+ output:
+ path "output_gene_mappings.tsv", emit: outputGeneMappings
+
+ script:
+ """
+ \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_genes.py \
+ --clinvar-xml ${clinvarXml} \
+ --output-file output_gene_mappings.tsv
+ """
+}
+
+/*
+ * Map existing crossrefs to EFO-aligned IDs. Currently used only for evaluation.
+ */
+process mapXrefs {
+ input:
+ path clinvarXml
+
+ output:
+ path "output_xref_mappings.tsv", emit: outputXrefMappings
+
+ script:
+ """
+ \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_xrefs.py \
+ --clinvar-xml ${clinvarXml} \
+ --output-file output_xref_mappings.tsv
+ """
+}
+
/*
* Generate annotated ClinVar XML
*/
@@ -216,16 +263,21 @@ process generateAnnotatedXml {
input:
path clinvarXml
path consequenceMappings
+ path evalGeneMapping
+ path evalXrefMapping
output:
path "annotated_clinvar.xml.gz"
script:
+ def evalGeneFlag = evalGeneMapping != null? "--eval-gene-file ${evalGeneMapping}" : ""
+ def evalXrefFlag = evalXrefMapping != null? "--eval-xref-file ${evalXrefMapping}" : ""
"""
\${PYTHON_BIN} \${CODE_ROOT}/bin/generate_annotated_xml.py \
--clinvar-xml ${clinvarXml} \
- --efo-mapping \${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv \
+ --efo-mapping ${params.mappings} \
--gene-mapping ${consequenceMappings} \
+ ${evalGeneFlag} ${evalXrefFlag} \
--output-xml annotated_clinvar.xml.gz
"""
}
diff --git a/eva_cttv_pipeline/evidence_string_generation/set_metrics.py b/eva_cttv_pipeline/evidence_string_generation/set_metrics.py
new file mode 100644
index 00000000..384c21bb
--- /dev/null
+++ b/eva_cttv_pipeline/evidence_string_generation/set_metrics.py
@@ -0,0 +1,91 @@
+class SetComparisonMetrics:
+ """Records counts and scores for comparing sets of arbitrary values from two sources (ClinVar and CMAT)."""
+
+ both_present_keys = [
+ 'exact_match',
+ 'cmat_superset',
+ 'cmat_subset',
+ 'divergent_match',
+ 'mismatch'
+ ]
+ some_missing_keys = [
+ 'cv_missing',
+ 'cmat_missing',
+ 'both_missing'
+ ]
+ all_keys = both_present_keys + some_missing_keys
+
+ def __init__(self):
+ self.counts = {k: 0 for k in self.all_keys}
+ self.scores = {k: 0 for k in self.all_keys}
+ # To keep counts in self.counts disjoint, add a separate field (calculated last) for both_present
+ self.both_present_count = 0
+ self.both_present_score = 0
+
+ def count_and_score(self, cv_set, cmat_set):
+ cv_set = set(cv_set)
+ cmat_set = set(cmat_set)
+
+ # First check if either set is empty - if so we count these but don't compute f-scores, as we do not assume
+ # either ClinVar or CMAT has complete coverage of the data.
+ if not cv_set and cmat_set:
+ self.counts['cv_missing'] += 1
+ elif cv_set and not cmat_set:
+ self.counts['cmat_missing'] += 1
+ elif not cv_set and not cmat_set:
+ self.counts['both_missing'] += 1
+
+ # Both sets present, compute the score and place in correct category.
+ else:
+ f1_score, true_pos, false_pos, false_neg = self.get_f1_tp_fp_fn(cv_set, cmat_set)
+ if false_pos and not false_neg:
+ k = 'cmat_superset'
+ elif not false_pos and false_neg:
+ k = 'cmat_subset'
+ elif not false_pos and not false_neg:
+ k = 'exact_match'
+ elif true_pos:
+ k = 'divergent_match'
+ else:
+ k = 'mismatch'
+ self.counts[k] += 1
+ self.scores[k] += f1_score
+
+ def finalise(self):
+ """Averages scores over counts, and also calculate both_present count & score."""
+ self.both_present_count = sum(self.counts[k] for k in self.both_present_keys)
+ if self.both_present_count:
+ self.both_present_score = sum(self.scores[k] for k in self.both_present_keys) / self.both_present_count
+ for k in self.all_keys:
+ self.scores[k] = self.scores[k] / self.counts[k] if self.counts[k] else 0
+
+ def report(self):
+ total = sum(v for v in self.counts.values())
+ print(f'Total = {total}')
+ self.pretty_print(
+ ('Category', 'Count', 'Percent', 'F1 Score'),
+ [(k, self.counts[k], f'{self.counts[k] / total:.1%}', f'{self.scores[k]:.2f}') for k in self.both_present_keys]
+ + [('=> both_present', self.both_present_count, f'{self.both_present_count / total:.1%}', f'{self.both_present_score:.2f}')]
+ + [(k, self.counts[k], f'{self.counts[k] / total:.1%}', f'{self.scores[k]:.2f}') for k in self.some_missing_keys]
+ )
+
+ @staticmethod
+ def get_f1_tp_fp_fn(cv_set, cmat_set):
+ """Returns f1-score and number of true positives, false positives and false negatives."""
+ if len(cv_set) == 0 and len(cmat_set) == 0:
+ return 0, 0, 0, 0
+ tp = len(cmat_set & cv_set)
+ fp = len(cmat_set - cv_set)
+ fn = len(cv_set - cmat_set)
+ return 2 * tp / (2 * tp + fp + fn), tp, fp, fn
+
+ @staticmethod
+ def pretty_print(header, table):
+ cell_widths = [len(h) for h in header]
+ for row in table:
+ for i, cell in enumerate(row):
+ cell_widths[i] = max(cell_widths[i], len(str(cell)))
+ format_string = ' '.join('{%s:>%s}' % (i, w) for i, w in enumerate(cell_widths))
+ print(' ' + format_string.format(*header) + ' ')
+ for row in table:
+ print(' ' + format_string.format(*row) + ' ')
diff --git a/setup.py b/setup.py
index cc8e9cb0..1a675e09 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ def get_requires():
setup(name='eva_cttv_pipeline',
- version='2.7.4',
+ version='2.8.0',
packages=find_packages(),
install_requires=get_requires(),
#! TBD: list as a dependency subpackage 'clinvar_xml_utils.clinvar_xml_utils.clinvar_xml_utils'
diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz
index e298e742..f793609a 100644
Binary files a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz and b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz differ
diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py b/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py
index a5a80cab..1a7f0498 100644
--- a/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py
+++ b/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py
@@ -40,6 +40,9 @@ def test_allele_origins(self):
def test_valid_allele_origins(self):
assert self.test_clinvar_record.valid_allele_origins == {'germline', 'inherited'}
+ def test_trait_efo_ids(self):
+ assert self.test_clinvar_record.traits[0].current_efo_aligned_xrefs == [('MONDO', 'MONDO:0012990', 'current')]
+
class TestClinvarRecordMeasure:
@classmethod
@@ -71,3 +74,6 @@ def test_variant_type(self):
def test_measure_set_pubmed_refs(self):
assert self.test_crm.pubmed_refs == []
+
+ def test_so_terms(self):
+ assert self.test_crm.existing_so_terms == {'SO:0001583'}
diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/test_set_metrics.py b/tests/eva_cttv_pipeline/evidence_string_generation/test_set_metrics.py
new file mode 100644
index 00000000..fab0b88f
--- /dev/null
+++ b/tests/eva_cttv_pipeline/evidence_string_generation/test_set_metrics.py
@@ -0,0 +1,39 @@
+from eva_cttv_pipeline.evidence_string_generation.set_metrics import SetComparisonMetrics
+
+
+# Tolerance for float comparisons
+EPSILON = 1e-6
+
+
+def test_f1_score():
+ assert abs(SetComparisonMetrics.get_f1_tp_fp_fn({1, 2}, {1})[0] - 2 / 3) < EPSILON
+ assert abs(SetComparisonMetrics.get_f1_tp_fp_fn({1, 2}, {2, 1})[0] - 1) < EPSILON
+ assert SetComparisonMetrics.get_f1_tp_fp_fn({}, {})[0] == 0
+
+
+def test_count_and_score():
+ metrics = SetComparisonMetrics()
+ metrics.count_and_score({}, {})
+ metrics.count_and_score({1, 2}, {1})
+ metrics.count_and_score({1, 2}, {2, 1})
+ metrics.count_and_score({1, 2}, {2, 3})
+ metrics.count_and_score({1, 2}, {2, 3, 4})
+ metrics.count_and_score({1}, {2})
+ metrics.count_and_score([], {2, 3})
+ metrics.finalise()
+
+ assert metrics.counts == {
+ 'exact_match': 1,
+ 'cmat_superset': 0,
+ 'cmat_subset': 1,
+ 'divergent_match': 2,
+ 'mismatch': 1,
+ 'cv_missing': 1,
+ 'cmat_missing': 0,
+ 'both_missing': 1,
+ }
+
+ assert metrics.scores['exact_match'] == 1
+ assert metrics.scores['mismatch'] == 0
+ # mean(2/(2+1+1), 2/(2+2+1)) = mean(0.5, 0.4) = 0.45
+ assert abs(metrics.scores['divergent_match'] - 0.45) < EPSILON