Skip to content

Commit

Permalink
Merge pull request #371 from apriltuesday/results-for-paper
Browse files Browse the repository at this point in the history
Add counts to annotated XML generation
  • Loading branch information
apriltuesday authored Apr 26, 2023
2 parents 6d36caa + 18673c7 commit b1778c9
Show file tree
Hide file tree
Showing 18 changed files with 580 additions and 47 deletions.
65 changes: 65 additions & 0 deletions bin/evaluation/map_genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python3
import argparse

import pandas as pd

from consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info
from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io


def load_clinvar_data(clinvar_xml):
"""Load ClinVar data, process for gene symbols and HGNC IDs, and return it as a Pandas dataframe.
Modified from similar functionality in the repeat expansion pipeline."""
variant_data = [] # To populate the return dataframe (see columns below)
for clinvar_record in clinvar_xml_io.ClinVarDataset(clinvar_xml):
# Skip a record if it does not contain variant information
if not clinvar_record.measure:
continue
measure = clinvar_record.measure

# Extract gene symbol(s). Here and below, dashes are sometimes assigned to be compatible with the variant
# summary format which was used previously.
gene_symbols = measure.preferred_gene_symbols
if not gene_symbols:
gene_symbols = ['-']

# Extract HGNC ID
hgnc_ids = measure.hgnc_ids
hgnc_id = hgnc_ids[0] if len(hgnc_ids) == 1 and len(gene_symbols) == 1 else '-'

# Append data strings (including unused columns for compatibility)
for gene_symbol in gene_symbols:
variant_data.append([
measure.preferred_or_other_name,
clinvar_record.accession,
gene_symbol,
hgnc_id,
None,
None
])

variants = pd.DataFrame(variant_data, columns=('Name',
'RCVaccession',
'GeneSymbol',
'HGNC_ID',
'TranscriptID',
'RepeatType'))

# Since the same record can have coordinates in multiple builds, it can be repeated. Remove duplicates
variants = variants.drop_duplicates()
return variants


def main(clinvar_xml, output_file):
"""Load ClinVar XML, map to Ensembl gene IDs, and dump results to TSV."""
variants = load_clinvar_data(clinvar_xml)
annotated_variants = annotate_ensembl_gene_info(variants)
annotated_variants[['RCVaccession', 'EnsemblGeneID']].to_csv(output_file, sep='\t', index=False, header=False)


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Script to convert HGNC IDs and gene symbols in ClinVar to Ensembl')
parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file')
parser.add_argument('--output-file', required=True, help='File to output dataframe')
args = parser.parse_args()
main(args.clinvar_xml, args.output_file)
65 changes: 65 additions & 0 deletions bin/evaluation/map_xrefs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python3
import argparse
import csv
import multiprocessing
from functools import lru_cache

from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
from eva_cttv_pipeline.trait_mapping.ols import build_ols_query
from eva_cttv_pipeline.trait_mapping.utils import json_request


@lru_cache
def get_synonyms(db, iden):
"""Find synonyms (replacement terms or exact matches) for this ontology identifier using OLS."""
synonyms = set()
ontology_uri = OntologyUri(iden, db)
url = build_ols_query(str(ontology_uri))
json_response = json_request(url)
if json_response and '_embedded' in json_response:
for term in json_response['_embedded']['terms']:
# Get only EFO terms (even if imported)
if term['ontology_name'] == 'efo':
synonyms.add(term['iri'])
# Check whether current term is obsolete
if term['is_obsolete'] and term['term_replaced_by']:
synonyms.add(term['term_replaced_by'])
# Also add exact matches
if 'exactMatch' in term['annotation']:
synonyms.update(term['annotation']['exactMatch'])

# Synonyms contains current EFO-included URIs, convert to DB:ID style
synonyms = {OntologyUri.uri_to_curie(s) for s in synonyms}
# Filter out Nones
synonyms = {s for s in synonyms if s is not None}

if synonyms:
return ontology_uri.curie, synonyms
# If no synonyms, just return the original identifier as is
return ontology_uri.curie, {ontology_uri.curie}


def main(clinvar_xml, output_file):
"""Load ClinVar XML, map trait xrefs identifiers to synonyms in OLS, and dump results to TSV."""
traits = set()
for record in clinvar_xml_io.ClinVarDataset(clinvar_xml):
for trait in record.traits_with_valid_names:
traits.update([(db, iden) for db, iden, _ in trait.current_efo_aligned_xrefs])

traits = list(traits)
process_pool = multiprocessing.Pool(processes=24)
annotated_traits = [
process_pool.apply(get_synonyms, args=(db, iden))
for db, iden in traits
]
with open(output_file, 'w+') as f:
csv.writer(f, delimiter="\t").writerows(annotated_traits)


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Script to map trait xrefs in ClinVar to synonyms in OLS')
parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file')
parser.add_argument('--output-file', required=True, help='File to output dataframe')
args = parser.parse_args()
main(args.clinvar_xml, args.output_file)
4 changes: 3 additions & 1 deletion bin/generate_annotated_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
parser.add_argument('--efo-mapping', help='Disease string to ontology mappings', required=True)
parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
parser.add_argument('--output-xml', help='Output XML file', required=True)
parser.add_argument('--eval-gene-file', help='Gene mappings for evaluation', required=False)
parser.add_argument('--eval-xref-file', help='Trait xref mappings for evaluation', required=False)


if __name__ == '__main__':
args = parser.parse_args()
generate_annotated_clinvar_xml(
clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
output_xml_file=args.output_xml)
output_xml_file=args.output_xml, eval_gene_file=args.eval_gene_file, eval_xref_file=args.eval_xref_file)
24 changes: 17 additions & 7 deletions bin/trait_mapping/create_efo_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import re
import requests

from requests import HTTPError
from retry import retry

# Name of ontology in OLS url, e. g. https://www.ebi.ac.uk/ols/ontologies/ordo/terms?iri=...
ontology_to_ols = {
'HP': 'hp',
Expand Down Expand Up @@ -49,15 +52,17 @@ def uri_to_curie(uri):
return uri.split('/')[-1].replace('#', '').replace('_', ':')


@retry(HTTPError, tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def get_cross_references(curie):
"""Queries OxO to return the list of cross-references for a given term curie."""
url = oxo_url_template.format(curie=curie)
response = requests.get(url).json()
if '_embedded' not in response:
print('Warning: OxO error for term {}. No cross-links will be available for this term. '
'See https://github.com/EBISPOT/OXO/issues/26'.format(curie))
return []
mappings = response['_embedded']['searchResults'][0]['mappingResponseList']
response = requests.get(url)
response.raise_for_status()
json_response = response.json()
if '_embedded' not in json_response:
raise ValueError('Warning: OxO error for term {}. No cross-links will be available for this term. '
'See https://github.com/EBISPOT/OXO/issues/26'.format(curie))
mappings = json_response['_embedded']['searchResults'][0]['mappingResponseList']
return [m['curie'] for m in mappings]


Expand All @@ -79,7 +84,12 @@ def get_ols_details(ontology, term):
# Cross-references
term_curie = uri_to_curie(term)
xrefs = {}
for x in get_cross_references(term_curie):
try:
oxo_xrefs = get_cross_references(term_curie)
except (HTTPError, ValueError) as e:
print('Warning: OxO error for term {}. No cross-links will be available for this term.'.format(term_curie))
oxo_xrefs = []
for x in oxo_xrefs:
xref_ontology, xref_id = re.split('[_:]', x)
xrefs.setdefault(xref_ontology, set()).add('{}:{}'.format(xref_ontology, xref_id))

Expand Down
23 changes: 1 addition & 22 deletions bin/traits_to_zooma_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,7 @@
from time import gmtime, strftime

from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io


class OntologyUri:
# ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with
# some examples of how this looks like in ClinVar data.
db_to_uri_conversion = {
'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}', # <XRef ID="1756" DB="Orphanet"/>
'omim': lambda x: f'https://www.omim.org/entry/{x}', # <XRef Type="MIM" ID="612773" DB="OMIM"/>
'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}', # <XRef ID="EFO_0005137" DB="EFO"/>
'mesh': lambda x: f'http://identifiers.org/mesh/{x}', # <XRef ID="D065630" DB="MeSH"/>
'medgen': lambda x: f'http://identifiers.org/medgen/{x}', # <XRef ID="C0235833" DB="MedGen"/>
# <XRef ID="MONDO:0013353" DB="MONDO"/>
'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
}

def __init__(self, id_, db):
self.id_ = id_
self.db = db
self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_)

def __str__(self):
return self.uri
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri


def write_zooma_record(clinvar_acc, variant_id, trait_name, ontology_uri, date, outfile):
Expand Down
10 changes: 10 additions & 0 deletions eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,16 @@ def nsv_id(self):
logger.warning(f'Found multiple NSV IDs for {self.clinvar_record}, this is not yet supported')
return None

@property
def existing_so_terms(self):
all_so_terms = set()
for attr_elem in find_elements(self.measure_xml, './AttributeSet'):
if find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]'):
for so_elem in find_elements(attr_elem, './XRef[@DB="Sequence Ontology"]'):
if 'providedBy' not in so_elem.attrib:
all_so_terms.add(so_elem.attrib['ID'])
return all_so_terms

@cached_property
def _hgvs_to_types(self):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ class ClinVarTrait:
'reclassified - variant of unknown significance', 'see cases', 'variant of unknown significance'
}

# These database identifiers are directly importable into EFO
EFO_ALIGNED_ONTOLOGIES = {'Human Phenotype Ontology', 'EFO', 'Orphanet', 'MONDO'}

def __init__(self, trait_xml, clinvar_record):
self.trait_xml = trait_xml
self.clinvar_record = clinvar_record
Expand Down Expand Up @@ -83,3 +86,9 @@ def medgen_id(self):
else:
logger.warning(f'Multiple MedGen IDs for {self}: {medgen_ids}')
return medgen_ids[0]

@property
def current_efo_aligned_xrefs(self):
"""Returns current EFO or EFO-aligned ids."""
return [(db, iden, status) for (db, iden, status) in self.xrefs
if status == 'current' and db in self.EFO_ALIGNED_ONTOLOGIES]
66 changes: 66 additions & 0 deletions eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import logging

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class OntologyUri:
# ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with
# some examples of how this looks like in ClinVar data.
db_to_uri_conversion = {
'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}', # <XRef ID="1756" DB="Orphanet"/>
'omim': lambda x: f'https://www.omim.org/entry/{x}', # <XRef Type="MIM" ID="612773" DB="OMIM"/>
'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}', # <XRef ID="EFO_0005137" DB="EFO"/>
'mesh': lambda x: f'http://identifiers.org/mesh/{x}', # <XRef ID="D065630" DB="MeSH"/>
'medgen': lambda x: f'http://identifiers.org/medgen/{x}', # <XRef ID="C0235833" DB="MedGen"/>
# <XRef ID="MONDO:0013353" DB="MONDO"/>
'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
# <XRef ID="HP:0011147" DB="Human Phenotype Ontology"/>
'hp': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
}

def __init__(self, id_, db):
self.id_ = id_
self.db = db if db.lower() != 'human phenotype ontology' else 'HP'
self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_)

def __str__(self):
return self.uri

@property
def curie(self):
return self.uri_to_curie(self.uri)

@staticmethod
def uri_to_curie(uri):
"""Convert an ontology uri to a DB:ID format."""
uri_db_to_curie_db = {
"ordo": "Orphanet",
"orphanet": "Orphanet",
"omim": "OMIM",
"efo": "EFO",
"hp": "HP",
"mondo": "MONDO",
}
if not any(x in uri.lower() for x in uri_db_to_curie_db.keys()):
return None
uri = uri.rstrip("/")
uri_list = uri.split("/")
if "identifiers.org" in uri:
db = uri_list[-2]
id_ = uri_list[-1]
elif "omim.org" in uri:
db = "OMIM"
id_ = uri_list[-1]
else:
last_component = uri_list[-1]
if ":" in last_component:
return last_component
elif "_" in last_component:
db, id_ = last_component.split("_")
else:
logger.warning(f"Could not convert URI to CURIE: {uri}")
return None
db = uri_db_to_curie_db[db.lower()]
return "{}:{}".format(db, id_)
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def parse_all_identifiers(clinvar_measure: ClinVarRecordMeasure):
Repeat type must be present if possible, transcript id is useful if present."""
# TODO should we prioritise these in some way?
all_names = [clinvar_measure.get_variant_name_or_hgvs()] +\
[hgvs.text for hgvs in clinvar_measure.current_hgvs] +\
sorted([hgvs.text for hgvs in clinvar_measure.current_hgvs]) +\
clinvar_measure.all_names
for name in all_names:
repeat_type, transcript_id = infer_repeat_type_and_transcript_id(name)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def parse_header_attributes(clinvar_xml):
if elem.tag == 'ReleaseSet':
attrib = elem.attrib
break
elem.clear()
if not attrib:
return {}
# Resolve xsi:noNamespaceSchemaLocation="...", which is parsed strangely by ElementTree
Expand Down
11 changes: 11 additions & 0 deletions eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri


def test_uri_to_curie():
assert OntologyUri.uri_to_curie('http://www.orpha.net/ORDO/Orphanet_713') == 'Orphanet:713'
assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/HP_0002930') == 'HP:0002930'
assert OntologyUri.uri_to_curie('https://omim.org/entry/300377') == 'OMIM:300377'
# If for some reason we're already passed a CURIE (probably a mistake in OLS response), return it as-is
assert OntologyUri.uri_to_curie('HP:0002505') == 'HP:0002505'
# Not a supported db
assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/DOID_10652') == None
Loading

0 comments on commit b1778c9

Please sign in to comment.