diff --git a/bin/evaluation/map_xrefs.py b/bin/evaluation/map_xrefs.py index d2762294..bf6cd0ce 100644 --- a/bin/evaluation/map_xrefs.py +++ b/bin/evaluation/map_xrefs.py @@ -17,7 +17,7 @@ def main(clinvar_xml, output_file): traits = list(traits) process_pool = multiprocessing.Pool(processes=24) annotated_traits = [ - process_pool.apply(fetch_eval_data, kwds={'db_iden': (db, iden), 'include_neighbors': True}) + process_pool.apply(fetch_eval_data, kwds={'db_iden': (db, iden), 'include_neighbors': False}) for db, iden in traits ] with open(output_file, 'w+') as f: diff --git a/cmat/clinvar_xml_io/clinvar_measure.py b/cmat/clinvar_xml_io/clinvar_measure.py index 14ae73b6..18ace3ef 100644 --- a/cmat/clinvar_xml_io/clinvar_measure.py +++ b/cmat/clinvar_xml_io/clinvar_measure.py @@ -89,10 +89,10 @@ def nsv_id(self): def existing_so_terms(self): all_so_terms = set() for attr_elem in find_elements(self.measure_xml, './AttributeSet'): - if find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]'): + if ('providedBy' not in attr_elem.attrib + and find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]')): for so_elem in find_elements(attr_elem, './XRef[@DB="Sequence Ontology"]'): - if 'providedBy' not in so_elem.attrib: - all_so_terms.add(so_elem.attrib['ID']) + all_so_terms.add(so_elem.attrib['ID']) return all_so_terms @cached_property diff --git a/cmat/output_generation/annotated_clinvar.py b/cmat/output_generation/annotated_clinvar.py index ff6bc51e..bba3bb42 100644 --- a/cmat/output_generation/annotated_clinvar.py +++ b/cmat/output_generation/annotated_clinvar.py @@ -31,6 +31,11 @@ def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings self.obsolete_counts = {} self.gene_metrics = None self.conseq_metrics = None + # Gene and consequence metrics, split by variant category + self.simple_variant_metrics = None + self.repeat_variant_metrics = None + self.complex_variant_metrics = None + self.trait_metrics = None self.mismatches_file = None @@ -40,6 +45,7 @@ def __iter__(self): 'total': 0, 'has_supported_measure': 0, 'has_supported_trait': 0, + 'both_measure_and_trait': 0 } self.obsolete_counts = { 'cv_total': 0, # total EFO xrefs used by ClinVar @@ -49,6 +55,11 @@ def __iter__(self): } self.gene_metrics = SetComparisonMetrics() self.conseq_metrics = SetComparisonMetrics() + # First counter is genes, second is consequences + self.simple_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics()) + self.repeat_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics()) + self.complex_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics()) + self.trait_metrics = SetComparisonMetrics() self.mismatches_file = open('mismatches.tsv', 'w+') self.mismatches_file.write('RCV\tCV\tCMAT\n') @@ -62,6 +73,8 @@ def __iter__(self): self.gene_metrics.finalise() self.conseq_metrics.finalise() self.trait_metrics.finalise() + for metrics in self.simple_variant_metrics + self.repeat_variant_metrics + self.complex_variant_metrics: + metrics.finalise() self.mismatches_file.close() def annotate(self, record): @@ -74,9 +87,11 @@ def annotate(self, record): if record.traits_with_valid_names: self.overall_counts['has_supported_trait'] += 1 self.annotate_and_count_traits(record) + if record.measure and record.traits_with_valid_names: + self.overall_counts['both_measure_and_trait'] += 1 def annotate_and_count_measure(self, record): - consequence_types = get_consequence_types(record.measure, self.variant_to_gene_mappings) + consequence_types, variant_category = get_consequence_types(record.measure, self.variant_to_gene_mappings) record.measure.add_ensembl_annotations(consequence_types) annotated_genes = {ct.ensembl_gene_id for ct in consequence_types if ct.ensembl_gene_id} @@ -87,6 +102,18 @@ def annotate_and_count_measure(self, record): existing_ensembl_ids = self.eval_gene_mappings.get(record.accession, []) self.gene_metrics.count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes) self.conseq_metrics.count_and_score(cv_set=record.measure.existing_so_terms, cmat_set=annotated_conseqs) + if variant_category == 'SIMPLE': + self.simple_variant_metrics[0].count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes) + self.simple_variant_metrics[1].count_and_score(cv_set=record.measure.existing_so_terms, + cmat_set=annotated_conseqs) + elif variant_category == 'REPEAT': + self.repeat_variant_metrics[0].count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes) + self.repeat_variant_metrics[1].count_and_score(cv_set=record.measure.existing_so_terms, + cmat_set=annotated_conseqs) + elif variant_category == 'COMPLEX': + self.complex_variant_metrics[0].count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes) + self.complex_variant_metrics[1].count_and_score(cv_set=record.measure.existing_so_terms, + cmat_set=annotated_conseqs) def annotate_and_count_traits(self, record): for trait in record.traits_with_valid_names: @@ -107,40 +134,41 @@ def annotate_and_count_traits(self, record): # Evaluation if self.eval_xref_mappings and self.eval_latest_mappings: + existing_current_efo_ids = set() for cv_id in existing_efo_ids: # Check whether existing ID is obsolete self.obsolete_counts['cv_total'] += 1 if self.eval_xref_mappings[cv_id]['is_obsolete']: self.obsolete_counts['cv_obsolete'] += 1 + # Only record current EFO-contained IDs for comparison + elif self.eval_xref_mappings[cv_id]['synonyms']: + existing_current_efo_ids.add(cv_id) - annotated_efo_ids = set() + annotated_current_efo_ids = set() for efo_id in efo_ids: # Check whether annotated ID is obsolete self.obsolete_counts['cmat_total'] += 1 if self.eval_latest_mappings[efo_id]['is_obsolete']: self.obsolete_counts['cmat_obsolete'] += 1 + # Don't add to the set of annotations if obsolete + continue - for cv_id in existing_efo_ids: + for cv_id in existing_current_efo_ids: # Attempt to match an ID in ClinVar based on synonyms - if our ID is in the list of synonyms for # a ClinVar ID (or vice versa), we use the synonymous ClinVar ID for comparison. if (efo_id in self.eval_xref_mappings[cv_id]['synonyms'] or cv_id in self.eval_latest_mappings[efo_id]['synonyms']): - annotated_efo_ids.add(cv_id) + annotated_current_efo_ids.add(cv_id) - # Similarly attempt to match based on neighbors - # TODO include direction (parents vs. children) somehow - if (efo_id in self.eval_xref_mappings[cv_id]['parents'] - or efo_id in self.eval_xref_mappings[cv_id]['children']): - annotated_efo_ids.add(cv_id) # If didn't find anything, just use our ID, which will count as not matching. - if not annotated_efo_ids: - annotated_efo_ids.add(efo_id) + if not annotated_current_efo_ids: + annotated_current_efo_ids.add(efo_id) - self.trait_metrics.count_and_score(cv_set=existing_efo_ids, cmat_set=annotated_efo_ids) + self.trait_metrics.count_and_score(cv_set=existing_current_efo_ids, cmat_set=annotated_current_efo_ids) # Output mismatches for manual inspection - if existing_efo_ids and annotated_efo_ids and len(existing_efo_ids & annotated_efo_ids) == 0: - self.mismatches_file.write(f"{record.accession}\t{','.join(existing_efo_ids)}\t{','.join(annotated_efo_ids)}\n") + if existing_current_efo_ids and annotated_current_efo_ids and len(existing_current_efo_ids & annotated_current_efo_ids) == 0: + self.mismatches_file.write(f"{record.accession}\t{','.join(existing_current_efo_ids)}\t{','.join(annotated_current_efo_ids)}\n") def report(self): print('\nOverall counts (RCVs):') @@ -150,6 +178,21 @@ def report(self): self.gene_metrics.report() print('\nFunctional consequences:') self.conseq_metrics.report() + + print('\nBy variant type:') + print('\n\tSimple (genes):') + self.simple_variant_metrics[0].report() + print('\n\tSimple (consequences):') + self.simple_variant_metrics[1].report() + print('\n\tRepeat (genes):') + self.repeat_variant_metrics[0].report() + print('\n\tRepeat (consequences):') + self.repeat_variant_metrics[1].report() + print('\n\tComplex (genes):') + self.complex_variant_metrics[0].report() + print('\n\tComplex (consequences):') + self.complex_variant_metrics[1].report() + if self.eval_xref_mappings: print('\nTrait mappings:') self.trait_metrics.report() @@ -193,8 +236,8 @@ class EnsemblAnnotatedClinVarMeasure(ClinVarRecordMeasure): def add_ensembl_annotations(self, consequences): consequence_elts = [] for consequence_attributes in consequences: - attr_set_elt = ET.Element('AttributeSet') - attribute_elt = ET.Element('Attribute', attrib={'Type': 'MolecularConsequence', 'providedBy': PROCESSOR}) + attr_set_elt = ET.Element('AttributeSet', attrib={'providedBy': PROCESSOR}) + attribute_elt = ET.Element('Attribute', attrib={'Type': 'MolecularConsequence'}) attribute_elt.text = consequence_attributes.so_term.so_name.replace('_', ' ') so_elt = ET.Element('XRef', attrib={'ID': self.format_so_term(consequence_attributes.so_term), 'DB': 'Sequence Ontology'}) @@ -238,19 +281,25 @@ def load_evaluation_xref_mappings(input_path): with open(input_path) as input_file: for line in input_file: cols = line.strip().split('\t') - if len(cols) != 5: + if len(cols) == 5: + mapping[cols[0]] = { + 'is_obsolete': cols[1] == 'True', + 'synonyms': string_to_set(cols[2]), + 'parents': string_to_set(cols[3]), + 'children': string_to_set(cols[4]) + } + elif len(cols) == 3: + mapping[cols[0]] = { + 'is_obsolete': cols[1] == 'True', + 'synonyms': string_to_set(cols[2]) + } + else: continue - mapping[cols[0]] = { - 'is_obsolete': cols[1] == 'True', - 'synonyms': string_to_set(cols[2]), - 'parents': string_to_set(cols[3]), - 'children': string_to_set(cols[4]) - } return mapping def string_to_set(s): - return set(re.sub(r"{|}|'", '', s).split(', ')) + return set(x for x in re.sub(r"{|}|'", '', s).split(', ') if x) def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file, diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 8354033a..94e980ff 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -149,7 +149,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele # origins, (2) EFO mappings, and (3) genes where the variant has effect. grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins) - consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings) + consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings) grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names, string_to_efo_mappings) @@ -290,7 +290,7 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict): # expansion variants will be fed to VEP and will receive incorrect consequence annotations. By using RCV pairing # first, we prioritise results of the variant expansion pipeline over the general VEP pipeline. if clinvar_record_measure.clinvar_record.accession in consequence_type_dict: - return consequence_type_dict[clinvar_record_measure.clinvar_record.accession] + return consequence_type_dict[clinvar_record_measure.clinvar_record.accession], 'REPEAT' # If RCV is not present in the consequences file, pair using full variant description (CHROM:POS:REF:ALT) if clinvar_record_measure.has_complete_coordinates: @@ -302,7 +302,7 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict): if IUPAC_AMBIGUOUS_SEQUENCE.search(clinvar_record_measure.vcf_ref + clinvar_record_measure.vcf_alt): logger.warning(f'Observed variant with non-ACGT allele sequences: {coord_id}') if coord_id in consequence_type_dict: - return consequence_type_dict[coord_id] + return consequence_type_dict[coord_id], 'SIMPLE' # If there's also no complete coordinates, pair using HGVS if clinvar_record_measure.preferred_current_hgvs: @@ -311,12 +311,12 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict): consequences = consequence_type_dict[hgvs_id] if len(consequences) > MAX_TARGET_GENES: logger.warning(f'Skipping variant {hgvs_id} with {len(consequences)} target genes') - return [] - return consequences + return [], 'NONE' + return consequences, 'COMPLEX' # Previously, the pairing was also attempted based on rsID and nsvID. This is not reliable because of lack of allele # specificity, and has been removed. - return [] + return [], 'NONE' def write_string_list_to_file(string_list, filename): diff --git a/cmat/output_generation/evaluation/ols_utils.py b/cmat/output_generation/evaluation/ols_utils.py index 41e47a2f..b63e9050 100644 --- a/cmat/output_generation/evaluation/ols_utils.py +++ b/cmat/output_generation/evaluation/ols_utils.py @@ -1,6 +1,8 @@ import logging from functools import lru_cache +from requests import RequestException + from cmat.clinvar_xml_io.ontology_uri import OntologyUri from cmat.trait_mapping.utils import json_request from cmat.trait_mapping.ols import build_ols_query @@ -26,14 +28,18 @@ def fetch_eval_data(*, db_iden=None, uri=None, include_neighbors=False): return None curie = OntologyUri.uri_to_curie(ontology_uri) - # Defaults to return if OLS query fails + # Defaults to return if OLS query fails or no term in EFO is_obsolete = False - synonyms = {curie} + synonyms = {} parents = {} children = {} url = build_ols_query(ontology_uri) - json_response = json_request(url) + try: + json_response = json_request(url) + except RequestException: + logger.warning(f'OLS4 error for {url}, trying OLS3...') + json_response = json_request(url.replace('/ols4/', '/ols/')) if json_response and '_embedded' in json_response: for term in json_response['_embedded']['terms']: # Get only EFO terms (even if imported) diff --git a/cmat/trait_mapping/main.py b/cmat/trait_mapping/main.py index e4de03c9..9e9da7e4 100644 --- a/cmat/trait_mapping/main.py +++ b/cmat/trait_mapping/main.py @@ -1,6 +1,7 @@ import csv import logging import multiprocessing +from collections import Counter from cmat.clinvar_xml_io import ClinVarTrait from cmat.trait_mapping.output import output_trait @@ -121,7 +122,9 @@ def process_traits(traits_filepath, output_mappings_filepath, output_curation_fi ] logger.info('Writing output with the processed traits') + finished_source_counts = Counter() for trait in processed_trait_list: - output_trait(trait, mapping_writer, curation_writer) + output_trait(trait, mapping_writer, curation_writer, finished_source_counts) logger.info('Finished processing trait names') + logger.info(f'Source counts for finished mappings: {finished_source_counts}') diff --git a/cmat/trait_mapping/output.py b/cmat/trait_mapping/output.py index 609b001b..34b7fe10 100644 --- a/cmat/trait_mapping/output.py +++ b/cmat/trait_mapping/output.py @@ -1,16 +1,28 @@ import csv +from collections import Counter from cmat.trait_mapping.trait import Trait -def output_trait_mapping(trait: Trait, mapping_writer: csv.writer): +def output_trait_mapping(trait: Trait, mapping_writer: csv.writer, finished_source_counts: Counter = None): """ Write any finished ontology mappings for a trait to a csv file writer. :param trait: A trait with finished ontology mappings in finished_mapping_set :param mapping_writer: A csv.writer to write the finished mappings + :param finished_source_counts: Optional Counter to count sources of finished mappings """ for ontology_entry in trait.finished_mapping_set: + # Need the corresponding Zooma result + zooma_mapping = None + for zooma_result in trait.zooma_result_list: + for zm in zooma_result.mapping_list: + if (zm.in_efo and zm.is_current and ontology_entry.uri == zm.uri + and ontology_entry.label == zm.ontology_label): + zooma_mapping = zm + break + if zooma_mapping and finished_source_counts: + finished_source_counts[zooma_mapping.source.lower()] += 1 mapping_writer.writerow([trait.name, ontology_entry.uri, ontology_entry.label]) @@ -55,7 +67,7 @@ def output_for_curation(trait: Trait, curation_writer: csv.writer): curation_writer.writerow(output_row) -def output_trait(trait: Trait, mapping_writer: csv.writer, curation_writer: csv.writer): +def output_trait(trait: Trait, mapping_writer: csv.writer, curation_writer: csv.writer, finished_source_counts: Counter): """ Output finished ontology mappings of a trait, or non-finished mappings (if any) for curation. @@ -64,6 +76,6 @@ def output_trait(trait: Trait, mapping_writer: csv.writer, curation_writer: csv. :param curation_writer: A csv.writer to write non-finished ontology mappings for manual curation """ if trait.is_finished: - output_trait_mapping(trait, mapping_writer) + output_trait_mapping(trait, mapping_writer, finished_source_counts) else: output_for_curation(trait, curation_writer) diff --git a/setup.py b/setup.py index 0fcd3168..b2b6ff18 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ def get_requires(): long_description = fh.read() setup(name='cmat', - version='3.0.1', + version='3.0.2', author_email='opentargets-clinvar@ebi.ac.uk', url='https://github.com/EBIvariation/eva-opentargets', packages=find_packages(), diff --git a/tests/output_generation/resources/expected_annotation_output.xml.gz b/tests/output_generation/resources/expected_annotation_output.xml.gz index f793609a..11e586b4 100644 Binary files a/tests/output_generation/resources/expected_annotation_output.xml.gz and b/tests/output_generation/resources/expected_annotation_output.xml.gz differ diff --git a/tests/output_generation/test_annotated_clinvar.py b/tests/output_generation/test_annotated_clinvar.py index 4f477cda..7abb05ff 100644 --- a/tests/output_generation/test_annotated_clinvar.py +++ b/tests/output_generation/test_annotated_clinvar.py @@ -2,11 +2,17 @@ import os import re -from cmat.output_generation.annotated_clinvar import generate_annotated_clinvar_xml +from cmat.output_generation.annotated_clinvar import generate_annotated_clinvar_xml, string_to_set resources_dir = os.path.join(os.path.dirname(__file__), 'resources') +def test_string_to_set(): + assert string_to_set("{'HP:0002269'}") == {'HP:0002269'} + assert string_to_set("{'Orphanet:49382', 'MONDO:0018852'}") == {'Orphanet:49382', 'MONDO:0018852'} + assert string_to_set('{}') == set() + + def test_generate_annotated_xml(): input_file = os.path.join(resources_dir, 'test_annotation_input.xml.gz') efo_mapping_file = os.path.join(resources_dir, 'string_to_ontology_mappings.tsv') diff --git a/tests/output_generation/test_clinvar_to_evidence_strings.py b/tests/output_generation/test_clinvar_to_evidence_strings.py index 9e337252..52f126fb 100644 --- a/tests/output_generation/test_clinvar_to_evidence_strings.py +++ b/tests/output_generation/test_clinvar_to_evidence_strings.py @@ -124,18 +124,18 @@ def setup_class(cls): def test_get_consequence_types(self): assert clinvar_to_evidence_strings.get_consequence_types(self.test_crm, self.consequence_type_dict)[ - 0] == CT.ConsequenceType('ENSG00000139988', CT.SoTerm('missense_variant')), '' - assert clinvar_to_evidence_strings.get_consequence_types(self.test_crm, {}) == [] + 0][0] == CT.ConsequenceType('ENSG00000139988', CT.SoTerm('missense_variant')), '' + assert clinvar_to_evidence_strings.get_consequence_types(self.test_crm, {})[0] == [] def test_structural_variant_consequences(self): structural_crm = config.get_test_clinvar_record('test_structural_record.xml.gz').measure consequences = [CT.ConsequenceType('ENSG00000075151', CT.SoTerm('splice_polypyrimidine_tract_variant'))] consequence_dict = {structural_crm.preferred_current_hgvs.text: consequences} - assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, consequence_dict) == consequences + assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, consequence_dict) == (consequences, 'COMPLEX') # don't get consequences if there are more than MAX_TARGET_GENES long_consequence_dict = {structural_crm.preferred_current_hgvs.text: consequences * (MAX_TARGET_GENES+1)} - assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, long_consequence_dict) == [] + assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, long_consequence_dict) == ([], 'NONE') class TestGenerateEvidenceStringTest: