diff --git a/bin/evaluation/map_genes.py b/bin/evaluation/map_genes.py new file mode 100644 index 00000000..fcd6342c --- /dev/null +++ b/bin/evaluation/map_genes.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +import argparse + +import pandas as pd + +from consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info +from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io + + +def load_clinvar_data(clinvar_xml): + """Load ClinVar data, process for gene symbols and HGNC IDs, and return it as a Pandas dataframe. + Modified from similar functionality in the repeat expansion pipeline.""" + variant_data = [] # To populate the return dataframe (see columns below) + for clinvar_record in clinvar_xml_io.ClinVarDataset(clinvar_xml): + # Skip a record if it does not contain variant information + if not clinvar_record.measure: + continue + measure = clinvar_record.measure + + # Extract gene symbol(s). Here and below, dashes are sometimes assigned to be compatible with the variant + # summary format which was used previously. + gene_symbols = measure.preferred_gene_symbols + if not gene_symbols: + gene_symbols = ['-'] + + # Extract HGNC ID + hgnc_ids = measure.hgnc_ids + hgnc_id = hgnc_ids[0] if len(hgnc_ids) == 1 and len(gene_symbols) == 1 else '-' + + # Append data strings (including unused columns for compatibility) + for gene_symbol in gene_symbols: + variant_data.append([ + measure.preferred_or_other_name, + clinvar_record.accession, + gene_symbol, + hgnc_id, + None, + None + ]) + + variants = pd.DataFrame(variant_data, columns=('Name', + 'RCVaccession', + 'GeneSymbol', + 'HGNC_ID', + 'TranscriptID', + 'RepeatType')) + + # Since the same record can have coordinates in multiple builds, it can be repeated. Remove duplicates + variants = variants.drop_duplicates() + return variants + + +def main(clinvar_xml, output_file): + """Load ClinVar XML, map to Ensembl gene IDs, and dump results to TSV.""" + variants = load_clinvar_data(clinvar_xml) + annotated_variants = annotate_ensembl_gene_info(variants) + annotated_variants[['RCVaccession', 'EnsemblGeneID']].to_csv(output_file, sep='\t', index=False, header=False) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Script to convert HGNC IDs and gene symbols in ClinVar to Ensembl') + parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file') + parser.add_argument('--output-file', required=True, help='File to output dataframe') + args = parser.parse_args() + main(args.clinvar_xml, args.output_file) diff --git a/bin/evaluation/map_xrefs.py b/bin/evaluation/map_xrefs.py new file mode 100644 index 00000000..55bb8066 --- /dev/null +++ b/bin/evaluation/map_xrefs.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +import argparse +import csv +import multiprocessing +from functools import lru_cache + +from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io +from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri +from eva_cttv_pipeline.trait_mapping.ols import build_ols_query +from eva_cttv_pipeline.trait_mapping.utils import json_request + + +@lru_cache +def get_synonyms(db, iden): + """Find synonyms (replacement terms or exact matches) for this ontology identifier using OLS.""" + synonyms = set() + ontology_uri = OntologyUri(iden, db) + url = build_ols_query(str(ontology_uri)) + json_response = json_request(url) + if json_response and '_embedded' in json_response: + for term in json_response['_embedded']['terms']: + # Get only EFO terms (even if imported) + if term['ontology_name'] == 'efo': + synonyms.add(term['iri']) + # Check whether current term is obsolete + if term['is_obsolete'] and term['term_replaced_by']: + synonyms.add(term['term_replaced_by']) + # Also add exact matches + if 'exactMatch' in term['annotation']: + synonyms.update(term['annotation']['exactMatch']) + + # Synonyms contains current EFO-included URIs, convert to DB:ID style + synonyms = {OntologyUri.uri_to_curie(s) for s in synonyms} + # Filter out Nones + synonyms = {s for s in synonyms if s is not None} + + if synonyms: + return ontology_uri.curie, synonyms + # If no synonyms, just return the original identifier as is + return ontology_uri.curie, {ontology_uri.curie} + + +def main(clinvar_xml, output_file): + """Load ClinVar XML, map trait xrefs identifiers to synonyms in OLS, and dump results to TSV.""" + traits = set() + for record in clinvar_xml_io.ClinVarDataset(clinvar_xml): + for trait in record.traits_with_valid_names: + traits.update([(db, iden) for db, iden, _ in trait.current_efo_aligned_xrefs]) + + traits = list(traits) + process_pool = multiprocessing.Pool(processes=24) + annotated_traits = [ + process_pool.apply(get_synonyms, args=(db, iden)) + for db, iden in traits + ] + with open(output_file, 'w+') as f: + csv.writer(f, delimiter="\t").writerows(annotated_traits) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Script to map trait xrefs in ClinVar to synonyms in OLS') + parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file') + parser.add_argument('--output-file', required=True, help='File to output dataframe') + args = parser.parse_args() + main(args.clinvar_xml, args.output_file) diff --git a/bin/generate_annotated_xml.py b/bin/generate_annotated_xml.py index 9c7f9c25..83bf3c75 100644 --- a/bin/generate_annotated_xml.py +++ b/bin/generate_annotated_xml.py @@ -8,10 +8,12 @@ parser.add_argument('--efo-mapping', help='Disease string to ontology mappings', required=True) parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True) parser.add_argument('--output-xml', help='Output XML file', required=True) +parser.add_argument('--eval-gene-file', help='Gene mappings for evaluation', required=False) +parser.add_argument('--eval-xref-file', help='Trait xref mappings for evaluation', required=False) if __name__ == '__main__': args = parser.parse_args() generate_annotated_clinvar_xml( clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping, - output_xml_file=args.output_xml) + output_xml_file=args.output_xml, eval_gene_file=args.eval_gene_file, eval_xref_file=args.eval_xref_file) diff --git a/bin/trait_mapping/create_efo_table.py b/bin/trait_mapping/create_efo_table.py index 561347c3..3df9dd79 100644 --- a/bin/trait_mapping/create_efo_table.py +++ b/bin/trait_mapping/create_efo_table.py @@ -4,6 +4,9 @@ import re import requests +from requests import HTTPError +from retry import retry + # Name of ontology in OLS url, e. g. https://www.ebi.ac.uk/ols/ontologies/ordo/terms?iri=... ontology_to_ols = { 'HP': 'hp', @@ -49,15 +52,17 @@ def uri_to_curie(uri): return uri.split('/')[-1].replace('#', '').replace('_', ':') +@retry(HTTPError, tries=4, delay=2, backoff=1.2, jitter=(1, 3)) def get_cross_references(curie): """Queries OxO to return the list of cross-references for a given term curie.""" url = oxo_url_template.format(curie=curie) - response = requests.get(url).json() - if '_embedded' not in response: - print('Warning: OxO error for term {}. No cross-links will be available for this term. ' - 'See https://github.com/EBISPOT/OXO/issues/26'.format(curie)) - return [] - mappings = response['_embedded']['searchResults'][0]['mappingResponseList'] + response = requests.get(url) + response.raise_for_status() + json_response = response.json() + if '_embedded' not in json_response: + raise ValueError('Warning: OxO error for term {}. No cross-links will be available for this term. ' + 'See https://github.com/EBISPOT/OXO/issues/26'.format(curie)) + mappings = json_response['_embedded']['searchResults'][0]['mappingResponseList'] return [m['curie'] for m in mappings] @@ -79,7 +84,12 @@ def get_ols_details(ontology, term): # Cross-references term_curie = uri_to_curie(term) xrefs = {} - for x in get_cross_references(term_curie): + try: + oxo_xrefs = get_cross_references(term_curie) + except (HTTPError, ValueError) as e: + print('Warning: OxO error for term {}. No cross-links will be available for this term.'.format(term_curie)) + oxo_xrefs = [] + for x in oxo_xrefs: xref_ontology, xref_id = re.split('[_:]', x) xrefs.setdefault(xref_ontology, set()).add('{}:{}'.format(xref_ontology, xref_id)) diff --git a/bin/traits_to_zooma_format.py b/bin/traits_to_zooma_format.py index ae4dffff..8b94765a 100644 --- a/bin/traits_to_zooma_format.py +++ b/bin/traits_to_zooma_format.py @@ -5,28 +5,7 @@ from time import gmtime, strftime from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io - - -class OntologyUri: - # ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with - # some examples of how this looks like in ClinVar data. - db_to_uri_conversion = { - 'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}', # - 'omim': lambda x: f'https://www.omim.org/entry/{x}', # - 'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}', # - 'mesh': lambda x: f'http://identifiers.org/mesh/{x}', # - 'medgen': lambda x: f'http://identifiers.org/medgen/{x}', # - # - 'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')), - } - - def __init__(self, id_, db): - self.id_ = id_ - self.db = db - self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_) - - def __str__(self): - return self.uri +from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri def write_zooma_record(clinvar_acc, variant_id, trait_name, ontology_uri, date, outfile): diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py index 5cb29a04..1fdca089 100644 --- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py +++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py @@ -85,6 +85,16 @@ def nsv_id(self): logger.warning(f'Found multiple NSV IDs for {self.clinvar_record}, this is not yet supported') return None + @property + def existing_so_terms(self): + all_so_terms = set() + for attr_elem in find_elements(self.measure_xml, './AttributeSet'): + if find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]'): + for so_elem in find_elements(attr_elem, './XRef[@DB="Sequence Ontology"]'): + if 'providedBy' not in so_elem.attrib: + all_so_terms.add(so_elem.attrib['ID']) + return all_so_terms + @cached_property def _hgvs_to_types(self): """ diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py index ed89e36d..60a0229e 100644 --- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py +++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py @@ -17,6 +17,9 @@ class ClinVarTrait: 'reclassified - variant of unknown significance', 'see cases', 'variant of unknown significance' } + # These database identifiers are directly importable into EFO + EFO_ALIGNED_ONTOLOGIES = {'Human Phenotype Ontology', 'EFO', 'Orphanet', 'MONDO'} + def __init__(self, trait_xml, clinvar_record): self.trait_xml = trait_xml self.clinvar_record = clinvar_record @@ -83,3 +86,9 @@ def medgen_id(self): else: logger.warning(f'Multiple MedGen IDs for {self}: {medgen_ids}') return medgen_ids[0] + + @property + def current_efo_aligned_xrefs(self): + """Returns current EFO or EFO-aligned ids.""" + return [(db, iden, status) for (db, iden, status) in self.xrefs + if status == 'current' and db in self.EFO_ALIGNED_ONTOLOGIES] diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py new file mode 100644 index 00000000..6b7f162f --- /dev/null +++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py @@ -0,0 +1,66 @@ +import logging + +logging.basicConfig() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class OntologyUri: + # ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with + # some examples of how this looks like in ClinVar data. + db_to_uri_conversion = { + 'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}', # + 'omim': lambda x: f'https://www.omim.org/entry/{x}', # + 'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}', # + 'mesh': lambda x: f'http://identifiers.org/mesh/{x}', # + 'medgen': lambda x: f'http://identifiers.org/medgen/{x}', # + # + 'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')), + # + 'hp': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')), + } + + def __init__(self, id_, db): + self.id_ = id_ + self.db = db if db.lower() != 'human phenotype ontology' else 'HP' + self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_) + + def __str__(self): + return self.uri + + @property + def curie(self): + return self.uri_to_curie(self.uri) + + @staticmethod + def uri_to_curie(uri): + """Convert an ontology uri to a DB:ID format.""" + uri_db_to_curie_db = { + "ordo": "Orphanet", + "orphanet": "Orphanet", + "omim": "OMIM", + "efo": "EFO", + "hp": "HP", + "mondo": "MONDO", + } + if not any(x in uri.lower() for x in uri_db_to_curie_db.keys()): + return None + uri = uri.rstrip("/") + uri_list = uri.split("/") + if "identifiers.org" in uri: + db = uri_list[-2] + id_ = uri_list[-1] + elif "omim.org" in uri: + db = "OMIM" + id_ = uri_list[-1] + else: + last_component = uri_list[-1] + if ":" in last_component: + return last_component + elif "_" in last_component: + db, id_ = last_component.split("_") + else: + logger.warning(f"Could not convert URI to CURIE: {uri}") + return None + db = uri_db_to_curie_db[db.lower()] + return "{}:{}".format(db, id_) diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py index 97f3c93c..9ece88fa 100644 --- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py +++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py @@ -106,7 +106,7 @@ def parse_all_identifiers(clinvar_measure: ClinVarRecordMeasure): Repeat type must be present if possible, transcript id is useful if present.""" # TODO should we prioritise these in some way? all_names = [clinvar_measure.get_variant_name_or_hgvs()] +\ - [hgvs.text for hgvs in clinvar_measure.current_hgvs] +\ + sorted([hgvs.text for hgvs in clinvar_measure.current_hgvs]) +\ clinvar_measure.all_names for name in all_names: repeat_type, transcript_id = infer_repeat_type_and_transcript_id(name) diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py index a57251a4..3a7ddf3b 100644 --- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py +++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py @@ -14,6 +14,7 @@ def parse_header_attributes(clinvar_xml): if elem.tag == 'ReleaseSet': attrib = elem.attrib break + elem.clear() if not attrib: return {} # Resolve xsi:noNamespaceSchemaLocation="...", which is parsed strangely by ElementTree diff --git a/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py b/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py new file mode 100644 index 00000000..9175fdc1 --- /dev/null +++ b/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py @@ -0,0 +1,11 @@ +from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri + + +def test_uri_to_curie(): + assert OntologyUri.uri_to_curie('http://www.orpha.net/ORDO/Orphanet_713') == 'Orphanet:713' + assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/HP_0002930') == 'HP:0002930' + assert OntologyUri.uri_to_curie('https://omim.org/entry/300377') == 'OMIM:300377' + # If for some reason we're already passed a CURIE (probably a mistake in OLS response), return it as-is + assert OntologyUri.uri_to_curie('HP:0002505') == 'HP:0002505' + # Not a supported db + assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/DOID_10652') == None diff --git a/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py b/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py index 5c374250..fd9833aa 100644 --- a/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py +++ b/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py @@ -1,4 +1,8 @@ +import re import xml.etree.ElementTree as ET +from collections import defaultdict + +from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io import ClinVarTrait, ClinVarRecordMeasure, ClinVarDataset, \ ClinVarRecord @@ -6,7 +10,7 @@ from eva_cttv_pipeline.evidence_string_generation.clinvar_to_evidence_strings import load_efo_mapping, \ get_consequence_types from eva_cttv_pipeline.evidence_string_generation import consequence_type as CT - +from eva_cttv_pipeline.evidence_string_generation.set_metrics import SetComparisonMetrics PROCESSOR = 'CMAT' @@ -15,29 +19,110 @@ class AnnotatingClinVarDataset(ClinVarDataset): """This class provides the ability to parse ClinVar records (RCVs) and annotate them with EFO mappings and consequence mappings on the fly.""" - def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings): + def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings, + eval_gene_mappings=None, eval_xref_mappings=None): super().__init__(clinvar_xml) self.header_attr['ProcessedBy'] = PROCESSOR self.string_to_efo_mappings = string_to_efo_mappings self.variant_to_gene_mappings = variant_to_gene_mappings + self.eval_gene_mappings = eval_gene_mappings + self.eval_xref_mappings = eval_xref_mappings + self.overall_counts = {} + self.gene_metrics = None + self.conseq_metrics = None + self.trait_metrics = None def __iter__(self): + # Initialise counts + self.overall_counts = { + 'total': 0, + 'has_supported_measure': 0, + 'has_supported_trait': 0, + } + self.gene_metrics = SetComparisonMetrics() + self.conseq_metrics = SetComparisonMetrics() + self.trait_metrics = SetComparisonMetrics() + for rcv in iterate_rcv_from_xml(self.clinvar_xml): record = AnnotatedClinVarRecord(rcv) self.annotate(record) yield record + # Finalise counts - computes averages, etc. + self.gene_metrics.finalise() + self.conseq_metrics.finalise() + self.trait_metrics.finalise() + def annotate(self, record): + self.overall_counts['total'] += 1 # Functional consequences for measure if record.measure: - consequence_types = get_consequence_types(record.measure, self.variant_to_gene_mappings) - record.measure.add_ensembl_annotations(consequence_types) - # EFO terms for trait + self.overall_counts['has_supported_measure'] += 1 + self.annotate_and_count_measure(record) + # EFO terms for traits + if record.traits_with_valid_names: + self.overall_counts['has_supported_trait'] += 1 + self.annotate_and_count_traits(record) + + def annotate_and_count_measure(self, record): + consequence_types = get_consequence_types(record.measure, self.variant_to_gene_mappings) + record.measure.add_ensembl_annotations(consequence_types) + + annotated_genes = {ct.ensembl_gene_id for ct in consequence_types if ct.ensembl_gene_id} + annotated_conseqs = {EnsemblAnnotatedClinVarMeasure.format_so_term(ct.so_term) + for ct in consequence_types if ct.so_term} + + if self.eval_gene_mappings: + existing_ensembl_ids = self.eval_gene_mappings.get(record.accession, []) + self.gene_metrics.count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes) + self.conseq_metrics.count_and_score(cv_set=record.measure.existing_so_terms, cmat_set=annotated_conseqs) + + def annotate_and_count_traits(self, record): for trait in record.traits_with_valid_names: + existing_efo_ids = set() + annotated_efo_ids = set() efo_ids = [] for trait_name in trait.all_names: - efo_ids.extend(efo_id for efo_id, efo_label in self.string_to_efo_mappings.get(trait_name.lower(), [])) + efo_ids.extend( + EfoMappedClinVarTrait.format_efo_id(efo_id) + for efo_id, efo_label in self.string_to_efo_mappings.get(trait_name.lower(), [])) + + for db, iden, _ in trait.current_efo_aligned_xrefs: + curie = OntologyUri(iden, db).curie + if curie: + existing_efo_ids.add(curie) trait.add_efo_mappings(efo_ids) + if self.eval_xref_mappings: + for efo_id in efo_ids: + # Attempt to match to an ID in ClinVar based on synonyms - if our ID is in the list of synonyms for + # a ClinVar ID, we use the synonymous ClinVar ID for comparison. + for cv_id in existing_efo_ids: + if efo_id in self.eval_xref_mappings.get(cv_id, {}): + annotated_efo_ids.add(cv_id) + # If didn't find anything, just use our ID, which will count as not matching. + if not annotated_efo_ids: + annotated_efo_ids.add(efo_id) + + self.trait_metrics.count_and_score(cv_set=existing_efo_ids, cmat_set=annotated_efo_ids) + + def report(self): + print('\nOverall counts (RCVs):') + self.print_counter(self.overall_counts) + if self.eval_gene_mappings: + print('\nGene annotations:') + self.gene_metrics.report() + print('\nFunctional consequences:') + self.conseq_metrics.report() + if self.eval_xref_mappings: + print('\nTrait mappings:') + self.trait_metrics.report() + print() + + @staticmethod + def print_counter(counter): + max_len = len(max(counter.keys(), key=lambda x: len(x))) + for k, v in counter.items(): + print(f'{k: <{max_len}} {v}') class AnnotatedClinVarRecord(ClinVarRecord): @@ -51,11 +136,18 @@ class EfoMappedClinVarTrait(ClinVarTrait): def add_efo_mappings(self, efo_ids): efo_elts = [] for efo_id in efo_ids: - if efo_id.startswith('http'): - efo_id = efo_id.split('/')[-1].replace('_', ':') - efo_elts.append(ET.Element('XRef', attrib={'ID': efo_id, 'DB': 'EFO', 'providedBy': PROCESSOR})) + efo_id = self.format_efo_id(efo_id) + # Include Status attribute so this isn't included among current xrefs + efo_elts.append(ET.Element('XRef', attrib={ + 'ID': efo_id, 'DB': 'EFO', 'Status': 'annotated', 'providedBy': PROCESSOR})) self.trait_xml.extend(efo_elts) + @staticmethod + def format_efo_id(efo_id): + if efo_id.startswith('http'): + return efo_id.split('/')[-1].replace('_', ':') + return efo_id + class EnsemblAnnotatedClinVarMeasure(ClinVarRecordMeasure): @@ -65,19 +157,54 @@ def add_ensembl_annotations(self, consequences): attr_set_elt = ET.Element('AttributeSet') attribute_elt = ET.Element('Attribute', attrib={'Type': 'MolecularConsequence', 'providedBy': PROCESSOR}) attribute_elt.text = consequence_attributes.so_term.so_name.replace('_', ' ') - so_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.so_term.accession.replace('_', ':'), + so_elt = ET.Element('XRef', attrib={'ID': self.format_so_term(consequence_attributes.so_term), 'DB': 'Sequence Ontology'}) ensembl_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.ensembl_gene_id, 'DB': 'Ensembl'}) attr_set_elt.extend((attribute_elt, so_elt, ensembl_elt)) consequence_elts.append(attr_set_elt) self.measure_xml.extend(consequence_elts) + @staticmethod + def format_so_term(so_term): + return so_term.accession.replace('_', ':') + -def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file): +def load_evaluation_gene_mappings(input_path): + mapping = defaultdict(list) + with open(input_path) as input_file: + for line in input_file: + cols = line.strip().split('\t') + if len(cols) != 2: + continue + mapping[cols[0]].append(cols[1]) + return mapping + + +def load_evaluation_xref_mappings(input_path): + mapping = {} + with open(input_path) as input_file: + for line in input_file: + cols = line.strip().split('\t') + if len(cols) != 2: + continue + mapping[cols[0]] = re.sub(r"{|}|'", '', cols[1]).split(', ') + return mapping + + +def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file, + eval_gene_file=None, eval_xref_file=None): """Generate an annotated XML file of ClinVar RCVs based on EFO mappings file and gene mapping file (as documented in clinvar_to_evidence_strings).""" string_to_efo_mappings = load_efo_mapping(efo_mapping_file) variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file) - - dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings) + # Need both files to do an evaluation + if eval_gene_file and eval_xref_file: + eval_gene_mappings = load_evaluation_gene_mappings(eval_gene_file) + eval_xref_mappings = load_evaluation_xref_mappings(eval_xref_file) + dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings, + eval_gene_mappings=eval_gene_mappings, + eval_xref_mappings=eval_xref_mappings) + else: + dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings) dataset.write(output_xml_file) + dataset.report() diff --git a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf index 3f23d07d..f104683e 100644 --- a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf +++ b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf @@ -11,6 +11,8 @@ def helpMessage() { --output_dir Directory for output --schema Open Targets JSON schema version (optional, will output XML if omitted) --clinvar ClinVar XML file (optional, will download latest if omitted) + --mappings Trait mappings file (optional, will use a default path if omitted) + --evaluate Whether to run evaluation or not (default false) """ } @@ -18,6 +20,8 @@ params.help = null params.output_dir = null params.schema = null params.clinvar = null +params.mappings = '${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv' +params.evaluate = false if (params.help) { exit 0, helpMessage() @@ -58,7 +62,14 @@ workflow { } else { // Annotated ClinVar XML output - generateAnnotatedXml(clinvarXml, combineConsequences.out.consequencesCombined) + if (params.evaluate) { + evalGeneMapping = mapGenes(clinvarXml) + evalXrefMapping = mapXrefs(clinvarXml) + } else { + evalGeneMapping = null + evalXrefMapping = null + } + generateAnnotatedXml(clinvarXml, combineConsequences.out.consequencesCombined, evalGeneMapping, evalXrefMapping) } } @@ -201,6 +212,42 @@ process combineConsequences { """ } +/* + * Map existing genes to Ensembl gene IDs. Currently used only for evaluation. + */ +process mapGenes { + input: + path clinvarXml + + output: + path "output_gene_mappings.tsv", emit: outputGeneMappings + + script: + """ + \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_genes.py \ + --clinvar-xml ${clinvarXml} \ + --output-file output_gene_mappings.tsv + """ +} + +/* + * Map existing crossrefs to EFO-aligned IDs. Currently used only for evaluation. + */ +process mapXrefs { + input: + path clinvarXml + + output: + path "output_xref_mappings.tsv", emit: outputXrefMappings + + script: + """ + \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_xrefs.py \ + --clinvar-xml ${clinvarXml} \ + --output-file output_xref_mappings.tsv + """ +} + /* * Generate annotated ClinVar XML */ @@ -216,16 +263,21 @@ process generateAnnotatedXml { input: path clinvarXml path consequenceMappings + path evalGeneMapping + path evalXrefMapping output: path "annotated_clinvar.xml.gz" script: + def evalGeneFlag = evalGeneMapping != null? "--eval-gene-file ${evalGeneMapping}" : "" + def evalXrefFlag = evalXrefMapping != null? "--eval-xref-file ${evalXrefMapping}" : "" """ \${PYTHON_BIN} \${CODE_ROOT}/bin/generate_annotated_xml.py \ --clinvar-xml ${clinvarXml} \ - --efo-mapping \${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv \ + --efo-mapping ${params.mappings} \ --gene-mapping ${consequenceMappings} \ + ${evalGeneFlag} ${evalXrefFlag} \ --output-xml annotated_clinvar.xml.gz """ } diff --git a/eva_cttv_pipeline/evidence_string_generation/set_metrics.py b/eva_cttv_pipeline/evidence_string_generation/set_metrics.py new file mode 100644 index 00000000..384c21bb --- /dev/null +++ b/eva_cttv_pipeline/evidence_string_generation/set_metrics.py @@ -0,0 +1,91 @@ +class SetComparisonMetrics: + """Records counts and scores for comparing sets of arbitrary values from two sources (ClinVar and CMAT).""" + + both_present_keys = [ + 'exact_match', + 'cmat_superset', + 'cmat_subset', + 'divergent_match', + 'mismatch' + ] + some_missing_keys = [ + 'cv_missing', + 'cmat_missing', + 'both_missing' + ] + all_keys = both_present_keys + some_missing_keys + + def __init__(self): + self.counts = {k: 0 for k in self.all_keys} + self.scores = {k: 0 for k in self.all_keys} + # To keep counts in self.counts disjoint, add a separate field (calculated last) for both_present + self.both_present_count = 0 + self.both_present_score = 0 + + def count_and_score(self, cv_set, cmat_set): + cv_set = set(cv_set) + cmat_set = set(cmat_set) + + # First check if either set is empty - if so we count these but don't compute f-scores, as we do not assume + # either ClinVar or CMAT has complete coverage of the data. + if not cv_set and cmat_set: + self.counts['cv_missing'] += 1 + elif cv_set and not cmat_set: + self.counts['cmat_missing'] += 1 + elif not cv_set and not cmat_set: + self.counts['both_missing'] += 1 + + # Both sets present, compute the score and place in correct category. + else: + f1_score, true_pos, false_pos, false_neg = self.get_f1_tp_fp_fn(cv_set, cmat_set) + if false_pos and not false_neg: + k = 'cmat_superset' + elif not false_pos and false_neg: + k = 'cmat_subset' + elif not false_pos and not false_neg: + k = 'exact_match' + elif true_pos: + k = 'divergent_match' + else: + k = 'mismatch' + self.counts[k] += 1 + self.scores[k] += f1_score + + def finalise(self): + """Averages scores over counts, and also calculate both_present count & score.""" + self.both_present_count = sum(self.counts[k] for k in self.both_present_keys) + if self.both_present_count: + self.both_present_score = sum(self.scores[k] for k in self.both_present_keys) / self.both_present_count + for k in self.all_keys: + self.scores[k] = self.scores[k] / self.counts[k] if self.counts[k] else 0 + + def report(self): + total = sum(v for v in self.counts.values()) + print(f'Total = {total}') + self.pretty_print( + ('Category', 'Count', 'Percent', 'F1 Score'), + [(k, self.counts[k], f'{self.counts[k] / total:.1%}', f'{self.scores[k]:.2f}') for k in self.both_present_keys] + + [('=> both_present', self.both_present_count, f'{self.both_present_count / total:.1%}', f'{self.both_present_score:.2f}')] + + [(k, self.counts[k], f'{self.counts[k] / total:.1%}', f'{self.scores[k]:.2f}') for k in self.some_missing_keys] + ) + + @staticmethod + def get_f1_tp_fp_fn(cv_set, cmat_set): + """Returns f1-score and number of true positives, false positives and false negatives.""" + if len(cv_set) == 0 and len(cmat_set) == 0: + return 0, 0, 0, 0 + tp = len(cmat_set & cv_set) + fp = len(cmat_set - cv_set) + fn = len(cv_set - cmat_set) + return 2 * tp / (2 * tp + fp + fn), tp, fp, fn + + @staticmethod + def pretty_print(header, table): + cell_widths = [len(h) for h in header] + for row in table: + for i, cell in enumerate(row): + cell_widths[i] = max(cell_widths[i], len(str(cell))) + format_string = ' '.join('{%s:>%s}' % (i, w) for i, w in enumerate(cell_widths)) + print(' ' + format_string.format(*header) + ' ') + for row in table: + print(' ' + format_string.format(*row) + ' ') diff --git a/setup.py b/setup.py index cc8e9cb0..1a675e09 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ def get_requires(): setup(name='eva_cttv_pipeline', - version='2.7.4', + version='2.8.0', packages=find_packages(), install_requires=get_requires(), #! TBD: list as a dependency subpackage 'clinvar_xml_utils.clinvar_xml_utils.clinvar_xml_utils' diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz index e298e742..f793609a 100644 Binary files a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz and b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz differ diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py b/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py index a5a80cab..1a7f0498 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py +++ b/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py @@ -40,6 +40,9 @@ def test_allele_origins(self): def test_valid_allele_origins(self): assert self.test_clinvar_record.valid_allele_origins == {'germline', 'inherited'} + def test_trait_efo_ids(self): + assert self.test_clinvar_record.traits[0].current_efo_aligned_xrefs == [('MONDO', 'MONDO:0012990', 'current')] + class TestClinvarRecordMeasure: @classmethod @@ -71,3 +74,6 @@ def test_variant_type(self): def test_measure_set_pubmed_refs(self): assert self.test_crm.pubmed_refs == [] + + def test_so_terms(self): + assert self.test_crm.existing_so_terms == {'SO:0001583'} diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/test_set_metrics.py b/tests/eva_cttv_pipeline/evidence_string_generation/test_set_metrics.py new file mode 100644 index 00000000..fab0b88f --- /dev/null +++ b/tests/eva_cttv_pipeline/evidence_string_generation/test_set_metrics.py @@ -0,0 +1,39 @@ +from eva_cttv_pipeline.evidence_string_generation.set_metrics import SetComparisonMetrics + + +# Tolerance for float comparisons +EPSILON = 1e-6 + + +def test_f1_score(): + assert abs(SetComparisonMetrics.get_f1_tp_fp_fn({1, 2}, {1})[0] - 2 / 3) < EPSILON + assert abs(SetComparisonMetrics.get_f1_tp_fp_fn({1, 2}, {2, 1})[0] - 1) < EPSILON + assert SetComparisonMetrics.get_f1_tp_fp_fn({}, {})[0] == 0 + + +def test_count_and_score(): + metrics = SetComparisonMetrics() + metrics.count_and_score({}, {}) + metrics.count_and_score({1, 2}, {1}) + metrics.count_and_score({1, 2}, {2, 1}) + metrics.count_and_score({1, 2}, {2, 3}) + metrics.count_and_score({1, 2}, {2, 3, 4}) + metrics.count_and_score({1}, {2}) + metrics.count_and_score([], {2, 3}) + metrics.finalise() + + assert metrics.counts == { + 'exact_match': 1, + 'cmat_superset': 0, + 'cmat_subset': 1, + 'divergent_match': 2, + 'mismatch': 1, + 'cv_missing': 1, + 'cmat_missing': 0, + 'both_missing': 1, + } + + assert metrics.scores['exact_match'] == 1 + assert metrics.scores['mismatch'] == 0 + # mean(2/(2+1+1), 2/(2+2+1)) = mean(0.5, 0.4) = 0.45 + assert abs(metrics.scores['divergent_match'] - 0.45) < EPSILON