Merge pull request #371 from apriltuesday/results-for-paper

Add counts to annotated XML generation
EBIvariation · Apr 26, 2023 · b1778c9 · b1778c9
2 parents 6d36caa + 18673c7
commit b1778c9
Show file tree

Hide file tree

Showing 18 changed files with 580 additions and 47 deletions.
diff --git a/bin/evaluation/map_genes.py b/bin/evaluation/map_genes.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import argparse
+
+import pandas as pd
+
+from consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info
+from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
+
+
+def load_clinvar_data(clinvar_xml):
+    """Load ClinVar data, process for gene symbols and HGNC IDs, and return it as a Pandas dataframe.
+     Modified from similar functionality in the repeat expansion pipeline."""
+    variant_data = []  # To populate the return dataframe (see columns below)
+    for clinvar_record in clinvar_xml_io.ClinVarDataset(clinvar_xml):
+        # Skip a record if it does not contain variant information
+        if not clinvar_record.measure:
+            continue
+        measure = clinvar_record.measure
+
+        # Extract gene symbol(s). Here and below, dashes are sometimes assigned to be compatible with the variant
+        # summary format which was used previously.
+        gene_symbols = measure.preferred_gene_symbols
+        if not gene_symbols:
+            gene_symbols = ['-']
+
+        # Extract HGNC ID
+        hgnc_ids = measure.hgnc_ids
+        hgnc_id = hgnc_ids[0] if len(hgnc_ids) == 1 and len(gene_symbols) == 1 else '-'
+
+        # Append data strings (including unused columns for compatibility)
+        for gene_symbol in gene_symbols:
+            variant_data.append([
+                measure.preferred_or_other_name,
+                clinvar_record.accession,
+                gene_symbol,
+                hgnc_id,
+                None,
+                None
+            ])
+
+    variants = pd.DataFrame(variant_data, columns=('Name',
+                                                   'RCVaccession',
+                                                   'GeneSymbol',
+                                                   'HGNC_ID',
+                                                   'TranscriptID',
+                                                   'RepeatType'))
+
+    # Since the same record can have coordinates in multiple builds, it can be repeated. Remove duplicates
+    variants = variants.drop_duplicates()
+    return variants
+
+
+def main(clinvar_xml, output_file):
+    """Load ClinVar XML, map to Ensembl gene IDs, and dump results to TSV."""
+    variants = load_clinvar_data(clinvar_xml)
+    annotated_variants = annotate_ensembl_gene_info(variants)
+    annotated_variants[['RCVaccession', 'EnsemblGeneID']].to_csv(output_file, sep='\t', index=False, header=False)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Script to convert HGNC IDs and gene symbols in ClinVar to Ensembl')
+    parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file')
+    parser.add_argument('--output-file', required=True, help='File to output dataframe')
+    args = parser.parse_args()
+    main(args.clinvar_xml, args.output_file)
diff --git a/bin/evaluation/map_xrefs.py b/bin/evaluation/map_xrefs.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import multiprocessing
+from functools import lru_cache
+
+from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
+from eva_cttv_pipeline.trait_mapping.ols import build_ols_query
+from eva_cttv_pipeline.trait_mapping.utils import json_request
+
+
+@lru_cache
+def get_synonyms(db, iden):
+    """Find synonyms (replacement terms or exact matches) for this ontology identifier using OLS."""
+    synonyms = set()
+    ontology_uri = OntologyUri(iden, db)
+    url = build_ols_query(str(ontology_uri))
+    json_response = json_request(url)
+    if json_response and '_embedded' in json_response:
+        for term in json_response['_embedded']['terms']:
+            # Get only EFO terms (even if imported)
+            if term['ontology_name'] == 'efo':
+                synonyms.add(term['iri'])
+                # Check whether current term is obsolete
+                if term['is_obsolete'] and term['term_replaced_by']:
+                    synonyms.add(term['term_replaced_by'])
+                # Also add exact matches
+                if 'exactMatch' in term['annotation']:
+                    synonyms.update(term['annotation']['exactMatch'])
+
+        # Synonyms contains current EFO-included URIs, convert to DB:ID style
+        synonyms = {OntologyUri.uri_to_curie(s) for s in synonyms}
+        # Filter out Nones
+        synonyms = {s for s in synonyms if s is not None}
+
+    if synonyms:
+        return ontology_uri.curie, synonyms
+    # If no synonyms, just return the original identifier as is
+    return ontology_uri.curie, {ontology_uri.curie}
+
+
+def main(clinvar_xml, output_file):
+    """Load ClinVar XML, map trait xrefs identifiers to synonyms in OLS, and dump results to TSV."""
+    traits = set()
+    for record in clinvar_xml_io.ClinVarDataset(clinvar_xml):
+        for trait in record.traits_with_valid_names:
+            traits.update([(db, iden) for db, iden, _ in trait.current_efo_aligned_xrefs])
+
+    traits = list(traits)
+    process_pool = multiprocessing.Pool(processes=24)
+    annotated_traits = [
+        process_pool.apply(get_synonyms, args=(db, iden))
+        for db, iden in traits
+    ]
+    with open(output_file, 'w+') as f:
+        csv.writer(f, delimiter="\t").writerows(annotated_traits)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Script to map trait xrefs in ClinVar to synonyms in OLS')
+    parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file')
+    parser.add_argument('--output-file', required=True, help='File to output dataframe')
+    args = parser.parse_args()
+    main(args.clinvar_xml, args.output_file)
diff --git a/bin/generate_annotated_xml.py b/bin/generate_annotated_xml.py
@@ -8,10 +8,12 @@
 parser.add_argument('--efo-mapping',  help='Disease string to ontology mappings',    required=True)
 parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
 parser.add_argument('--output-xml',   help='Output XML file',                        required=True)
+parser.add_argument('--eval-gene-file', help='Gene mappings for evaluation',         required=False)
+parser.add_argument('--eval-xref-file', help='Trait xref mappings for evaluation',   required=False)
 
 
 if __name__ == '__main__':
     args = parser.parse_args()
     generate_annotated_clinvar_xml(
         clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
-        output_xml_file=args.output_xml)
+        output_xml_file=args.output_xml, eval_gene_file=args.eval_gene_file, eval_xref_file=args.eval_xref_file)
diff --git a/bin/trait_mapping/create_efo_table.py b/bin/trait_mapping/create_efo_table.py
@@ -4,6 +4,9 @@
 import re
 import requests
 
+from requests import HTTPError
+from retry import retry
+
 # Name of ontology in OLS url, e. g. https://www.ebi.ac.uk/ols/ontologies/ordo/terms?iri=...
 ontology_to_ols = {
     'HP': 'hp',
@@ -49,15 +52,17 @@ def uri_to_curie(uri):
     return uri.split('/')[-1].replace('#', '').replace('_', ':')
 
 
+@retry(HTTPError, tries=4, delay=2, backoff=1.2, jitter=(1, 3))
 def get_cross_references(curie):
     """Queries OxO to return the list of cross-references for a given term curie."""
     url = oxo_url_template.format(curie=curie)
-    response = requests.get(url).json()
-    if '_embedded' not in response:
-        print('Warning: OxO error for term {}. No cross-links will be available for this term. '
-              'See https://github.com/EBISPOT/OXO/issues/26'.format(curie))
-        return []
-    mappings = response['_embedded']['searchResults'][0]['mappingResponseList']
+    response = requests.get(url)
+    response.raise_for_status()
+    json_response = response.json()
+    if '_embedded' not in json_response:
+        raise ValueError('Warning: OxO error for term {}. No cross-links will be available for this term. '
+                         'See https://github.com/EBISPOT/OXO/issues/26'.format(curie))
+    mappings = json_response['_embedded']['searchResults'][0]['mappingResponseList']
     return [m['curie'] for m in mappings]
 
 
@@ -79,7 +84,12 @@ def get_ols_details(ontology, term):
     # Cross-references
     term_curie = uri_to_curie(term)
     xrefs = {}
-    for x in get_cross_references(term_curie):
+    try:
+        oxo_xrefs = get_cross_references(term_curie)
+    except (HTTPError, ValueError) as e:
+        print('Warning: OxO error for term {}. No cross-links will be available for this term.'.format(term_curie))
+        oxo_xrefs = []
+    for x in oxo_xrefs:
         xref_ontology, xref_id = re.split('[_:]', x)
         xrefs.setdefault(xref_ontology, set()).add('{}:{}'.format(xref_ontology, xref_id))
 

diff --git a/bin/traits_to_zooma_format.py b/bin/traits_to_zooma_format.py
@@ -5,28 +5,7 @@
 from time import gmtime, strftime
 
 from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
-
-
-class OntologyUri:
-    # ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with
-    # some examples of how this looks like in ClinVar data.
-    db_to_uri_conversion = {
-        'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}',  # <XRef ID="1756" DB="Orphanet"/>
-        'omim': lambda x: f'https://www.omim.org/entry/{x}',  # <XRef Type="MIM" ID="612773" DB="OMIM"/>
-        'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}',  # <XRef ID="EFO_0005137" DB="EFO"/>
-        'mesh': lambda x: f'http://identifiers.org/mesh/{x}',  # <XRef ID="D065630" DB="MeSH"/>
-        'medgen': lambda x: f'http://identifiers.org/medgen/{x}',  # <XRef ID="C0235833" DB="MedGen"/>
-        # <XRef ID="MONDO:0013353" DB="MONDO"/>
-        'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
-    }
-
-    def __init__(self, id_, db):
-        self.id_ = id_
-        self.db = db
-        self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_)
-
-    def __str__(self):
-        return self.uri
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
 
 
 def write_zooma_record(clinvar_acc, variant_id, trait_name, ontology_uri, date, outfile):

diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py
@@ -85,6 +85,16 @@ def nsv_id(self):
             logger.warning(f'Found multiple NSV IDs for {self.clinvar_record}, this is not yet supported')
             return None
 
+    @property
+    def existing_so_terms(self):
+        all_so_terms = set()
+        for attr_elem in find_elements(self.measure_xml, './AttributeSet'):
+            if find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]'):
+                for so_elem in find_elements(attr_elem, './XRef[@DB="Sequence Ontology"]'):
+                    if 'providedBy' not in so_elem.attrib:
+                        all_so_terms.add(so_elem.attrib['ID'])
+        return all_so_terms
+
     @cached_property
     def _hgvs_to_types(self):
         """

diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py
@@ -17,6 +17,9 @@ class ClinVarTrait:
         'reclassified - variant of unknown significance', 'see cases', 'variant of unknown significance'
     }
 
+    # These database identifiers are directly importable into EFO
+    EFO_ALIGNED_ONTOLOGIES = {'Human Phenotype Ontology', 'EFO', 'Orphanet', 'MONDO'}
+
     def __init__(self, trait_xml, clinvar_record):
         self.trait_xml = trait_xml
         self.clinvar_record = clinvar_record
@@ -83,3 +86,9 @@ def medgen_id(self):
         else:
             logger.warning(f'Multiple MedGen IDs for {self}: {medgen_ids}')
             return medgen_ids[0]
+
+    @property
+    def current_efo_aligned_xrefs(self):
+        """Returns current EFO or EFO-aligned ids."""
+        return [(db, iden, status) for (db, iden, status) in self.xrefs
+                if status == 'current' and db in self.EFO_ALIGNED_ONTOLOGIES]
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py
@@ -0,0 +1,66 @@
+import logging
+
+logging.basicConfig()
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class OntologyUri:
+    # ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with
+    # some examples of how this looks like in ClinVar data.
+    db_to_uri_conversion = {
+        'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}',  # <XRef ID="1756" DB="Orphanet"/>
+        'omim': lambda x: f'https://www.omim.org/entry/{x}',  # <XRef Type="MIM" ID="612773" DB="OMIM"/>
+        'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}',  # <XRef ID="EFO_0005137" DB="EFO"/>
+        'mesh': lambda x: f'http://identifiers.org/mesh/{x}',  # <XRef ID="D065630" DB="MeSH"/>
+        'medgen': lambda x: f'http://identifiers.org/medgen/{x}',  # <XRef ID="C0235833" DB="MedGen"/>
+        # <XRef ID="MONDO:0013353" DB="MONDO"/>
+        'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
+        # <XRef ID="HP:0011147" DB="Human Phenotype Ontology"/>
+        'hp': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
+    }
+
+    def __init__(self, id_, db):
+        self.id_ = id_
+        self.db = db if db.lower() != 'human phenotype ontology' else 'HP'
+        self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_)
+
+    def __str__(self):
+        return self.uri
+
+    @property
+    def curie(self):
+        return self.uri_to_curie(self.uri)
+
+    @staticmethod
+    def uri_to_curie(uri):
+        """Convert an ontology uri to a DB:ID format."""
+        uri_db_to_curie_db = {
+            "ordo": "Orphanet",
+            "orphanet": "Orphanet",
+            "omim": "OMIM",
+            "efo": "EFO",
+            "hp": "HP",
+            "mondo": "MONDO",
+        }
+        if not any(x in uri.lower() for x in uri_db_to_curie_db.keys()):
+            return None
+        uri = uri.rstrip("/")
+        uri_list = uri.split("/")
+        if "identifiers.org" in uri:
+            db = uri_list[-2]
+            id_ = uri_list[-1]
+        elif "omim.org" in uri:
+            db = "OMIM"
+            id_ = uri_list[-1]
+        else:
+            last_component = uri_list[-1]
+            if ":" in last_component:
+                return last_component
+            elif "_" in last_component:
+                db, id_ = last_component.split("_")
+            else:
+                logger.warning(f"Could not convert URI to CURIE: {uri}")
+                return None
+        db = uri_db_to_curie_db[db.lower()]
+        return "{}:{}".format(db, id_)
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py
@@ -106,7 +106,7 @@ def parse_all_identifiers(clinvar_measure: ClinVarRecordMeasure):
     Repeat type must be present if possible, transcript id is useful if present."""
     # TODO should we prioritise these in some way?
     all_names = [clinvar_measure.get_variant_name_or_hgvs()] +\
-                [hgvs.text for hgvs in clinvar_measure.current_hgvs] +\
+                sorted([hgvs.text for hgvs in clinvar_measure.current_hgvs]) +\
                 clinvar_measure.all_names
     for name in all_names:
         repeat_type, transcript_id = infer_repeat_type_and_transcript_id(name)

diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py
@@ -14,6 +14,7 @@ def parse_header_attributes(clinvar_xml):
             if elem.tag == 'ReleaseSet':
                 attrib = elem.attrib
                 break
+            elem.clear()
     if not attrib:
         return {}
     # Resolve xsi:noNamespaceSchemaLocation="...", which is parsed strangely by ElementTree

diff --git a/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py b/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py
@@ -0,0 +1,11 @@
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
+
+
+def test_uri_to_curie():
+    assert OntologyUri.uri_to_curie('http://www.orpha.net/ORDO/Orphanet_713') == 'Orphanet:713'
+    assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/HP_0002930') == 'HP:0002930'
+    assert OntologyUri.uri_to_curie('https://omim.org/entry/300377') == 'OMIM:300377'
+    # If for some reason we're already passed a CURIE (probably a mistake in OLS response), return it as-is
+    assert OntologyUri.uri_to_curie('HP:0002505') == 'HP:0002505'
+    # Not a supported db
+    assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/DOID_10652') == None