-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #371 from apriltuesday/results-for-paper
Add counts to annotated XML generation
- Loading branch information
Showing
18 changed files
with
580 additions
and
47 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#!/usr/bin/env python3 | ||
import argparse | ||
|
||
import pandas as pd | ||
|
||
from consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info | ||
from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io | ||
|
||
|
||
def load_clinvar_data(clinvar_xml): | ||
"""Load ClinVar data, process for gene symbols and HGNC IDs, and return it as a Pandas dataframe. | ||
Modified from similar functionality in the repeat expansion pipeline.""" | ||
variant_data = [] # To populate the return dataframe (see columns below) | ||
for clinvar_record in clinvar_xml_io.ClinVarDataset(clinvar_xml): | ||
# Skip a record if it does not contain variant information | ||
if not clinvar_record.measure: | ||
continue | ||
measure = clinvar_record.measure | ||
|
||
# Extract gene symbol(s). Here and below, dashes are sometimes assigned to be compatible with the variant | ||
# summary format which was used previously. | ||
gene_symbols = measure.preferred_gene_symbols | ||
if not gene_symbols: | ||
gene_symbols = ['-'] | ||
|
||
# Extract HGNC ID | ||
hgnc_ids = measure.hgnc_ids | ||
hgnc_id = hgnc_ids[0] if len(hgnc_ids) == 1 and len(gene_symbols) == 1 else '-' | ||
|
||
# Append data strings (including unused columns for compatibility) | ||
for gene_symbol in gene_symbols: | ||
variant_data.append([ | ||
measure.preferred_or_other_name, | ||
clinvar_record.accession, | ||
gene_symbol, | ||
hgnc_id, | ||
None, | ||
None | ||
]) | ||
|
||
variants = pd.DataFrame(variant_data, columns=('Name', | ||
'RCVaccession', | ||
'GeneSymbol', | ||
'HGNC_ID', | ||
'TranscriptID', | ||
'RepeatType')) | ||
|
||
# Since the same record can have coordinates in multiple builds, it can be repeated. Remove duplicates | ||
variants = variants.drop_duplicates() | ||
return variants | ||
|
||
|
||
def main(clinvar_xml, output_file): | ||
"""Load ClinVar XML, map to Ensembl gene IDs, and dump results to TSV.""" | ||
variants = load_clinvar_data(clinvar_xml) | ||
annotated_variants = annotate_ensembl_gene_info(variants) | ||
annotated_variants[['RCVaccession', 'EnsemblGeneID']].to_csv(output_file, sep='\t', index=False, header=False) | ||
|
||
|
||
if __name__ == '__main__': | ||
parser = argparse.ArgumentParser(description='Script to convert HGNC IDs and gene symbols in ClinVar to Ensembl') | ||
parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file') | ||
parser.add_argument('--output-file', required=True, help='File to output dataframe') | ||
args = parser.parse_args() | ||
main(args.clinvar_xml, args.output_file) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#!/usr/bin/env python3 | ||
import argparse | ||
import csv | ||
import multiprocessing | ||
from functools import lru_cache | ||
|
||
from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io | ||
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri | ||
from eva_cttv_pipeline.trait_mapping.ols import build_ols_query | ||
from eva_cttv_pipeline.trait_mapping.utils import json_request | ||
|
||
|
||
@lru_cache | ||
def get_synonyms(db, iden): | ||
"""Find synonyms (replacement terms or exact matches) for this ontology identifier using OLS.""" | ||
synonyms = set() | ||
ontology_uri = OntologyUri(iden, db) | ||
url = build_ols_query(str(ontology_uri)) | ||
json_response = json_request(url) | ||
if json_response and '_embedded' in json_response: | ||
for term in json_response['_embedded']['terms']: | ||
# Get only EFO terms (even if imported) | ||
if term['ontology_name'] == 'efo': | ||
synonyms.add(term['iri']) | ||
# Check whether current term is obsolete | ||
if term['is_obsolete'] and term['term_replaced_by']: | ||
synonyms.add(term['term_replaced_by']) | ||
# Also add exact matches | ||
if 'exactMatch' in term['annotation']: | ||
synonyms.update(term['annotation']['exactMatch']) | ||
|
||
# Synonyms contains current EFO-included URIs, convert to DB:ID style | ||
synonyms = {OntologyUri.uri_to_curie(s) for s in synonyms} | ||
# Filter out Nones | ||
synonyms = {s for s in synonyms if s is not None} | ||
|
||
if synonyms: | ||
return ontology_uri.curie, synonyms | ||
# If no synonyms, just return the original identifier as is | ||
return ontology_uri.curie, {ontology_uri.curie} | ||
|
||
|
||
def main(clinvar_xml, output_file): | ||
"""Load ClinVar XML, map trait xrefs identifiers to synonyms in OLS, and dump results to TSV.""" | ||
traits = set() | ||
for record in clinvar_xml_io.ClinVarDataset(clinvar_xml): | ||
for trait in record.traits_with_valid_names: | ||
traits.update([(db, iden) for db, iden, _ in trait.current_efo_aligned_xrefs]) | ||
|
||
traits = list(traits) | ||
process_pool = multiprocessing.Pool(processes=24) | ||
annotated_traits = [ | ||
process_pool.apply(get_synonyms, args=(db, iden)) | ||
for db, iden in traits | ||
] | ||
with open(output_file, 'w+') as f: | ||
csv.writer(f, delimiter="\t").writerows(annotated_traits) | ||
|
||
|
||
if __name__ == '__main__': | ||
parser = argparse.ArgumentParser(description='Script to map trait xrefs in ClinVar to synonyms in OLS') | ||
parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file') | ||
parser.add_argument('--output-file', required=True, help='File to output dataframe') | ||
args = parser.parse_args() | ||
main(args.clinvar_xml, args.output_file) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
66 changes: 66 additions & 0 deletions
66
eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import logging | ||
|
||
logging.basicConfig() | ||
logger = logging.getLogger(__name__) | ||
logger.setLevel(logging.INFO) | ||
|
||
|
||
class OntologyUri: | ||
# ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with | ||
# some examples of how this looks like in ClinVar data. | ||
db_to_uri_conversion = { | ||
'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}', # <XRef ID="1756" DB="Orphanet"/> | ||
'omim': lambda x: f'https://www.omim.org/entry/{x}', # <XRef Type="MIM" ID="612773" DB="OMIM"/> | ||
'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}', # <XRef ID="EFO_0005137" DB="EFO"/> | ||
'mesh': lambda x: f'http://identifiers.org/mesh/{x}', # <XRef ID="D065630" DB="MeSH"/> | ||
'medgen': lambda x: f'http://identifiers.org/medgen/{x}', # <XRef ID="C0235833" DB="MedGen"/> | ||
# <XRef ID="MONDO:0013353" DB="MONDO"/> | ||
'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')), | ||
# <XRef ID="HP:0011147" DB="Human Phenotype Ontology"/> | ||
'hp': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')), | ||
} | ||
|
||
def __init__(self, id_, db): | ||
self.id_ = id_ | ||
self.db = db if db.lower() != 'human phenotype ontology' else 'HP' | ||
self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_) | ||
|
||
def __str__(self): | ||
return self.uri | ||
|
||
@property | ||
def curie(self): | ||
return self.uri_to_curie(self.uri) | ||
|
||
@staticmethod | ||
def uri_to_curie(uri): | ||
"""Convert an ontology uri to a DB:ID format.""" | ||
uri_db_to_curie_db = { | ||
"ordo": "Orphanet", | ||
"orphanet": "Orphanet", | ||
"omim": "OMIM", | ||
"efo": "EFO", | ||
"hp": "HP", | ||
"mondo": "MONDO", | ||
} | ||
if not any(x in uri.lower() for x in uri_db_to_curie_db.keys()): | ||
return None | ||
uri = uri.rstrip("/") | ||
uri_list = uri.split("/") | ||
if "identifiers.org" in uri: | ||
db = uri_list[-2] | ||
id_ = uri_list[-1] | ||
elif "omim.org" in uri: | ||
db = "OMIM" | ||
id_ = uri_list[-1] | ||
else: | ||
last_component = uri_list[-1] | ||
if ":" in last_component: | ||
return last_component | ||
elif "_" in last_component: | ||
db, id_ = last_component.split("_") | ||
else: | ||
logger.warning(f"Could not convert URI to CURIE: {uri}") | ||
return None | ||
db = uri_db_to_curie_db[db.lower()] | ||
return "{}:{}".format(db, id_) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
11 changes: 11 additions & 0 deletions
11
eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri | ||
|
||
|
||
def test_uri_to_curie(): | ||
assert OntologyUri.uri_to_curie('http://www.orpha.net/ORDO/Orphanet_713') == 'Orphanet:713' | ||
assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/HP_0002930') == 'HP:0002930' | ||
assert OntologyUri.uri_to_curie('https://omim.org/entry/300377') == 'OMIM:300377' | ||
# If for some reason we're already passed a CURIE (probably a mistake in OLS response), return it as-is | ||
assert OntologyUri.uri_to_curie('HP:0002505') == 'HP:0002505' | ||
# Not a supported db | ||
assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/DOID_10652') == None |
Oops, something went wrong.