Skip to content

Commit

Permalink
Merge pull request #394 from apriltuesday/EVA-3327_2
Browse files Browse the repository at this point in the history
Final counts for CMAT
  • Loading branch information
apriltuesday authored Jul 31, 2023
2 parents 5bbf36d + 8a9ce25 commit e73d993
Show file tree
Hide file tree
Showing 11 changed files with 123 additions and 47 deletions.
2 changes: 1 addition & 1 deletion bin/evaluation/map_xrefs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def main(clinvar_xml, output_file):
traits = list(traits)
process_pool = multiprocessing.Pool(processes=24)
annotated_traits = [
process_pool.apply(fetch_eval_data, kwds={'db_iden': (db, iden), 'include_neighbors': True})
process_pool.apply(fetch_eval_data, kwds={'db_iden': (db, iden), 'include_neighbors': False})
for db, iden in traits
]
with open(output_file, 'w+') as f:
Expand Down
6 changes: 3 additions & 3 deletions cmat/clinvar_xml_io/clinvar_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ def nsv_id(self):
def existing_so_terms(self):
all_so_terms = set()
for attr_elem in find_elements(self.measure_xml, './AttributeSet'):
if find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]'):
if ('providedBy' not in attr_elem.attrib
and find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]')):
for so_elem in find_elements(attr_elem, './XRef[@DB="Sequence Ontology"]'):
if 'providedBy' not in so_elem.attrib:
all_so_terms.add(so_elem.attrib['ID'])
all_so_terms.add(so_elem.attrib['ID'])
return all_so_terms

@cached_property
Expand Down
97 changes: 73 additions & 24 deletions cmat/output_generation/annotated_clinvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings
self.obsolete_counts = {}
self.gene_metrics = None
self.conseq_metrics = None
# Gene and consequence metrics, split by variant category
self.simple_variant_metrics = None
self.repeat_variant_metrics = None
self.complex_variant_metrics = None

self.trait_metrics = None
self.mismatches_file = None

Expand All @@ -40,6 +45,7 @@ def __iter__(self):
'total': 0,
'has_supported_measure': 0,
'has_supported_trait': 0,
'both_measure_and_trait': 0
}
self.obsolete_counts = {
'cv_total': 0, # total EFO xrefs used by ClinVar
Expand All @@ -49,6 +55,11 @@ def __iter__(self):
}
self.gene_metrics = SetComparisonMetrics()
self.conseq_metrics = SetComparisonMetrics()
# First counter is genes, second is consequences
self.simple_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics())
self.repeat_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics())
self.complex_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics())

self.trait_metrics = SetComparisonMetrics()
self.mismatches_file = open('mismatches.tsv', 'w+')
self.mismatches_file.write('RCV\tCV\tCMAT\n')
Expand All @@ -62,6 +73,8 @@ def __iter__(self):
self.gene_metrics.finalise()
self.conseq_metrics.finalise()
self.trait_metrics.finalise()
for metrics in self.simple_variant_metrics + self.repeat_variant_metrics + self.complex_variant_metrics:
metrics.finalise()
self.mismatches_file.close()

def annotate(self, record):
Expand All @@ -74,9 +87,11 @@ def annotate(self, record):
if record.traits_with_valid_names:
self.overall_counts['has_supported_trait'] += 1
self.annotate_and_count_traits(record)
if record.measure and record.traits_with_valid_names:
self.overall_counts['both_measure_and_trait'] += 1

def annotate_and_count_measure(self, record):
consequence_types = get_consequence_types(record.measure, self.variant_to_gene_mappings)
consequence_types, variant_category = get_consequence_types(record.measure, self.variant_to_gene_mappings)
record.measure.add_ensembl_annotations(consequence_types)

annotated_genes = {ct.ensembl_gene_id for ct in consequence_types if ct.ensembl_gene_id}
Expand All @@ -87,6 +102,18 @@ def annotate_and_count_measure(self, record):
existing_ensembl_ids = self.eval_gene_mappings.get(record.accession, [])
self.gene_metrics.count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes)
self.conseq_metrics.count_and_score(cv_set=record.measure.existing_so_terms, cmat_set=annotated_conseqs)
if variant_category == 'SIMPLE':
self.simple_variant_metrics[0].count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes)
self.simple_variant_metrics[1].count_and_score(cv_set=record.measure.existing_so_terms,
cmat_set=annotated_conseqs)
elif variant_category == 'REPEAT':
self.repeat_variant_metrics[0].count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes)
self.repeat_variant_metrics[1].count_and_score(cv_set=record.measure.existing_so_terms,
cmat_set=annotated_conseqs)
elif variant_category == 'COMPLEX':
self.complex_variant_metrics[0].count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes)
self.complex_variant_metrics[1].count_and_score(cv_set=record.measure.existing_so_terms,
cmat_set=annotated_conseqs)

def annotate_and_count_traits(self, record):
for trait in record.traits_with_valid_names:
Expand All @@ -107,40 +134,41 @@ def annotate_and_count_traits(self, record):

# Evaluation
if self.eval_xref_mappings and self.eval_latest_mappings:
existing_current_efo_ids = set()
for cv_id in existing_efo_ids:
# Check whether existing ID is obsolete
self.obsolete_counts['cv_total'] += 1
if self.eval_xref_mappings[cv_id]['is_obsolete']:
self.obsolete_counts['cv_obsolete'] += 1
# Only record current EFO-contained IDs for comparison
elif self.eval_xref_mappings[cv_id]['synonyms']:
existing_current_efo_ids.add(cv_id)

annotated_efo_ids = set()
annotated_current_efo_ids = set()
for efo_id in efo_ids:

# Check whether annotated ID is obsolete
self.obsolete_counts['cmat_total'] += 1
if self.eval_latest_mappings[efo_id]['is_obsolete']:
self.obsolete_counts['cmat_obsolete'] += 1
# Don't add to the set of annotations if obsolete
continue

for cv_id in existing_efo_ids:
for cv_id in existing_current_efo_ids:
# Attempt to match an ID in ClinVar based on synonyms - if our ID is in the list of synonyms for
# a ClinVar ID (or vice versa), we use the synonymous ClinVar ID for comparison.
if (efo_id in self.eval_xref_mappings[cv_id]['synonyms']
or cv_id in self.eval_latest_mappings[efo_id]['synonyms']):
annotated_efo_ids.add(cv_id)
annotated_current_efo_ids.add(cv_id)

# Similarly attempt to match based on neighbors
# TODO include direction (parents vs. children) somehow
if (efo_id in self.eval_xref_mappings[cv_id]['parents']
or efo_id in self.eval_xref_mappings[cv_id]['children']):
annotated_efo_ids.add(cv_id)
# If didn't find anything, just use our ID, which will count as not matching.
if not annotated_efo_ids:
annotated_efo_ids.add(efo_id)
if not annotated_current_efo_ids:
annotated_current_efo_ids.add(efo_id)

self.trait_metrics.count_and_score(cv_set=existing_efo_ids, cmat_set=annotated_efo_ids)
self.trait_metrics.count_and_score(cv_set=existing_current_efo_ids, cmat_set=annotated_current_efo_ids)
# Output mismatches for manual inspection
if existing_efo_ids and annotated_efo_ids and len(existing_efo_ids & annotated_efo_ids) == 0:
self.mismatches_file.write(f"{record.accession}\t{','.join(existing_efo_ids)}\t{','.join(annotated_efo_ids)}\n")
if existing_current_efo_ids and annotated_current_efo_ids and len(existing_current_efo_ids & annotated_current_efo_ids) == 0:
self.mismatches_file.write(f"{record.accession}\t{','.join(existing_current_efo_ids)}\t{','.join(annotated_current_efo_ids)}\n")

def report(self):
print('\nOverall counts (RCVs):')
Expand All @@ -150,6 +178,21 @@ def report(self):
self.gene_metrics.report()
print('\nFunctional consequences:')
self.conseq_metrics.report()

print('\nBy variant type:')
print('\n\tSimple (genes):')
self.simple_variant_metrics[0].report()
print('\n\tSimple (consequences):')
self.simple_variant_metrics[1].report()
print('\n\tRepeat (genes):')
self.repeat_variant_metrics[0].report()
print('\n\tRepeat (consequences):')
self.repeat_variant_metrics[1].report()
print('\n\tComplex (genes):')
self.complex_variant_metrics[0].report()
print('\n\tComplex (consequences):')
self.complex_variant_metrics[1].report()

if self.eval_xref_mappings:
print('\nTrait mappings:')
self.trait_metrics.report()
Expand Down Expand Up @@ -193,8 +236,8 @@ class EnsemblAnnotatedClinVarMeasure(ClinVarRecordMeasure):
def add_ensembl_annotations(self, consequences):
consequence_elts = []
for consequence_attributes in consequences:
attr_set_elt = ET.Element('AttributeSet')
attribute_elt = ET.Element('Attribute', attrib={'Type': 'MolecularConsequence', 'providedBy': PROCESSOR})
attr_set_elt = ET.Element('AttributeSet', attrib={'providedBy': PROCESSOR})
attribute_elt = ET.Element('Attribute', attrib={'Type': 'MolecularConsequence'})
attribute_elt.text = consequence_attributes.so_term.so_name.replace('_', ' ')
so_elt = ET.Element('XRef', attrib={'ID': self.format_so_term(consequence_attributes.so_term),
'DB': 'Sequence Ontology'})
Expand Down Expand Up @@ -238,19 +281,25 @@ def load_evaluation_xref_mappings(input_path):
with open(input_path) as input_file:
for line in input_file:
cols = line.strip().split('\t')
if len(cols) != 5:
if len(cols) == 5:
mapping[cols[0]] = {
'is_obsolete': cols[1] == 'True',
'synonyms': string_to_set(cols[2]),
'parents': string_to_set(cols[3]),
'children': string_to_set(cols[4])
}
elif len(cols) == 3:
mapping[cols[0]] = {
'is_obsolete': cols[1] == 'True',
'synonyms': string_to_set(cols[2])
}
else:
continue
mapping[cols[0]] = {
'is_obsolete': cols[1] == 'True',
'synonyms': string_to_set(cols[2]),
'parents': string_to_set(cols[3]),
'children': string_to_set(cols[4])
}
return mapping


def string_to_set(s):
return set(re.sub(r"{|}|'", '', s).split(', '))
return set(x for x in re.sub(r"{|}|'", '', s).split(', ') if x)


def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file,
Expand Down
12 changes: 6 additions & 6 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele
# origins, (2) EFO mappings, and (3) genes where the variant has effect.
grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)

Expand Down Expand Up @@ -290,7 +290,7 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict):
# expansion variants will be fed to VEP and will receive incorrect consequence annotations. By using RCV pairing
# first, we prioritise results of the variant expansion pipeline over the general VEP pipeline.
if clinvar_record_measure.clinvar_record.accession in consequence_type_dict:
return consequence_type_dict[clinvar_record_measure.clinvar_record.accession]
return consequence_type_dict[clinvar_record_measure.clinvar_record.accession], 'REPEAT'

# If RCV is not present in the consequences file, pair using full variant description (CHROM:POS:REF:ALT)
if clinvar_record_measure.has_complete_coordinates:
Expand All @@ -302,7 +302,7 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict):
if IUPAC_AMBIGUOUS_SEQUENCE.search(clinvar_record_measure.vcf_ref + clinvar_record_measure.vcf_alt):
logger.warning(f'Observed variant with non-ACGT allele sequences: {coord_id}')
if coord_id in consequence_type_dict:
return consequence_type_dict[coord_id]
return consequence_type_dict[coord_id], 'SIMPLE'

# If there's also no complete coordinates, pair using HGVS
if clinvar_record_measure.preferred_current_hgvs:
Expand All @@ -311,12 +311,12 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict):
consequences = consequence_type_dict[hgvs_id]
if len(consequences) > MAX_TARGET_GENES:
logger.warning(f'Skipping variant {hgvs_id} with {len(consequences)} target genes')
return []
return consequences
return [], 'NONE'
return consequences, 'COMPLEX'

# Previously, the pairing was also attempted based on rsID and nsvID. This is not reliable because of lack of allele
# specificity, and has been removed.
return []
return [], 'NONE'


def write_string_list_to_file(string_list, filename):
Expand Down
12 changes: 9 additions & 3 deletions cmat/output_generation/evaluation/ols_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
from functools import lru_cache

from requests import RequestException

from cmat.clinvar_xml_io.ontology_uri import OntologyUri
from cmat.trait_mapping.utils import json_request
from cmat.trait_mapping.ols import build_ols_query
Expand All @@ -26,14 +28,18 @@ def fetch_eval_data(*, db_iden=None, uri=None, include_neighbors=False):
return None
curie = OntologyUri.uri_to_curie(ontology_uri)

# Defaults to return if OLS query fails
# Defaults to return if OLS query fails or no term in EFO
is_obsolete = False
synonyms = {curie}
synonyms = {}
parents = {}
children = {}

url = build_ols_query(ontology_uri)
json_response = json_request(url)
try:
json_response = json_request(url)
except RequestException:
logger.warning(f'OLS4 error for {url}, trying OLS3...')
json_response = json_request(url.replace('/ols4/', '/ols/'))
if json_response and '_embedded' in json_response:
for term in json_response['_embedded']['terms']:
# Get only EFO terms (even if imported)
Expand Down
5 changes: 4 additions & 1 deletion cmat/trait_mapping/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import csv
import logging
import multiprocessing
from collections import Counter

from cmat.clinvar_xml_io import ClinVarTrait
from cmat.trait_mapping.output import output_trait
Expand Down Expand Up @@ -121,7 +122,9 @@ def process_traits(traits_filepath, output_mappings_filepath, output_curation_fi
]

logger.info('Writing output with the processed traits')
finished_source_counts = Counter()
for trait in processed_trait_list:
output_trait(trait, mapping_writer, curation_writer)
output_trait(trait, mapping_writer, curation_writer, finished_source_counts)

logger.info('Finished processing trait names')
logger.info(f'Source counts for finished mappings: {finished_source_counts}')
18 changes: 15 additions & 3 deletions cmat/trait_mapping/output.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,28 @@
import csv
from collections import Counter

from cmat.trait_mapping.trait import Trait


def output_trait_mapping(trait: Trait, mapping_writer: csv.writer):
def output_trait_mapping(trait: Trait, mapping_writer: csv.writer, finished_source_counts: Counter = None):
"""
Write any finished ontology mappings for a trait to a csv file writer.
:param trait: A trait with finished ontology mappings in finished_mapping_set
:param mapping_writer: A csv.writer to write the finished mappings
:param finished_source_counts: Optional Counter to count sources of finished mappings
"""
for ontology_entry in trait.finished_mapping_set:
# Need the corresponding Zooma result
zooma_mapping = None
for zooma_result in trait.zooma_result_list:
for zm in zooma_result.mapping_list:
if (zm.in_efo and zm.is_current and ontology_entry.uri == zm.uri
and ontology_entry.label == zm.ontology_label):
zooma_mapping = zm
break
if zooma_mapping and finished_source_counts:
finished_source_counts[zooma_mapping.source.lower()] += 1
mapping_writer.writerow([trait.name, ontology_entry.uri, ontology_entry.label])


Expand Down Expand Up @@ -55,7 +67,7 @@ def output_for_curation(trait: Trait, curation_writer: csv.writer):
curation_writer.writerow(output_row)


def output_trait(trait: Trait, mapping_writer: csv.writer, curation_writer: csv.writer):
def output_trait(trait: Trait, mapping_writer: csv.writer, curation_writer: csv.writer, finished_source_counts: Counter):
"""
Output finished ontology mappings of a trait, or non-finished mappings (if any) for curation.
Expand All @@ -64,6 +76,6 @@ def output_trait(trait: Trait, mapping_writer: csv.writer, curation_writer: csv.
:param curation_writer: A csv.writer to write non-finished ontology mappings for manual curation
"""
if trait.is_finished:
output_trait_mapping(trait, mapping_writer)
output_trait_mapping(trait, mapping_writer, finished_source_counts)
else:
output_for_curation(trait, curation_writer)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def get_requires():
long_description = fh.read()

setup(name='cmat',
version='3.0.1',
version='3.0.2',
author_email='opentargets-clinvar@ebi.ac.uk',
url='https://github.com/EBIvariation/eva-opentargets',
packages=find_packages(),
Expand Down
Binary file not shown.
8 changes: 7 additions & 1 deletion tests/output_generation/test_annotated_clinvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@
import os
import re

from cmat.output_generation.annotated_clinvar import generate_annotated_clinvar_xml
from cmat.output_generation.annotated_clinvar import generate_annotated_clinvar_xml, string_to_set

resources_dir = os.path.join(os.path.dirname(__file__), 'resources')


def test_string_to_set():
assert string_to_set("{'HP:0002269'}") == {'HP:0002269'}
assert string_to_set("{'Orphanet:49382', 'MONDO:0018852'}") == {'Orphanet:49382', 'MONDO:0018852'}
assert string_to_set('{}') == set()


def test_generate_annotated_xml():
input_file = os.path.join(resources_dir, 'test_annotation_input.xml.gz')
efo_mapping_file = os.path.join(resources_dir, 'string_to_ontology_mappings.tsv')
Expand Down
Loading

0 comments on commit e73d993

Please sign in to comment.