Merge pull request #394 from apriltuesday/EVA-3327_2

Final counts for CMAT
EBIvariation · Jul 31, 2023 · e73d993 · e73d993
2 parents 5bbf36d + 8a9ce25
commit e73d993
Show file tree

Hide file tree

Showing 11 changed files with 123 additions and 47 deletions.
diff --git a/bin/evaluation/map_xrefs.py b/bin/evaluation/map_xrefs.py
@@ -17,7 +17,7 @@ def main(clinvar_xml, output_file):
     traits = list(traits)
     process_pool = multiprocessing.Pool(processes=24)
     annotated_traits = [
-        process_pool.apply(fetch_eval_data, kwds={'db_iden': (db, iden), 'include_neighbors': True})
+        process_pool.apply(fetch_eval_data, kwds={'db_iden': (db, iden), 'include_neighbors': False})
         for db, iden in traits
     ]
     with open(output_file, 'w+') as f:

diff --git a/cmat/clinvar_xml_io/clinvar_measure.py b/cmat/clinvar_xml_io/clinvar_measure.py
@@ -89,10 +89,10 @@ def nsv_id(self):
     def existing_so_terms(self):
         all_so_terms = set()
         for attr_elem in find_elements(self.measure_xml, './AttributeSet'):
-            if find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]'):
+            if ('providedBy' not in attr_elem.attrib
+                    and find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]')):
                 for so_elem in find_elements(attr_elem, './XRef[@DB="Sequence Ontology"]'):
-                    if 'providedBy' not in so_elem.attrib:
-                        all_so_terms.add(so_elem.attrib['ID'])
+                    all_so_terms.add(so_elem.attrib['ID'])
         return all_so_terms
 
     @cached_property

diff --git a/cmat/output_generation/annotated_clinvar.py b/cmat/output_generation/annotated_clinvar.py
@@ -31,6 +31,11 @@ def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings
         self.obsolete_counts = {}
         self.gene_metrics = None
         self.conseq_metrics = None
+        # Gene and consequence metrics, split by variant category
+        self.simple_variant_metrics = None
+        self.repeat_variant_metrics = None
+        self.complex_variant_metrics = None
+
         self.trait_metrics = None
         self.mismatches_file = None
 
@@ -40,6 +45,7 @@ def __iter__(self):
             'total': 0,
             'has_supported_measure': 0,
             'has_supported_trait': 0,
+            'both_measure_and_trait': 0
         }
         self.obsolete_counts = {
             'cv_total': 0,    # total EFO xrefs used by ClinVar
@@ -49,6 +55,11 @@ def __iter__(self):
         }
         self.gene_metrics = SetComparisonMetrics()
         self.conseq_metrics = SetComparisonMetrics()
+        # First counter is genes, second is consequences
+        self.simple_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics())
+        self.repeat_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics())
+        self.complex_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics())
+
         self.trait_metrics = SetComparisonMetrics()
         self.mismatches_file = open('mismatches.tsv', 'w+')
         self.mismatches_file.write('RCV\tCV\tCMAT\n')
@@ -62,6 +73,8 @@ def __iter__(self):
         self.gene_metrics.finalise()
         self.conseq_metrics.finalise()
         self.trait_metrics.finalise()
+        for metrics in self.simple_variant_metrics + self.repeat_variant_metrics + self.complex_variant_metrics:
+            metrics.finalise()
         self.mismatches_file.close()
 
     def annotate(self, record):
@@ -74,9 +87,11 @@ def annotate(self, record):
         if record.traits_with_valid_names:
             self.overall_counts['has_supported_trait'] += 1
             self.annotate_and_count_traits(record)
+        if record.measure and record.traits_with_valid_names:
+            self.overall_counts['both_measure_and_trait'] += 1
 
     def annotate_and_count_measure(self, record):
-        consequence_types = get_consequence_types(record.measure, self.variant_to_gene_mappings)
+        consequence_types, variant_category = get_consequence_types(record.measure, self.variant_to_gene_mappings)
         record.measure.add_ensembl_annotations(consequence_types)
 
         annotated_genes = {ct.ensembl_gene_id for ct in consequence_types if ct.ensembl_gene_id}
@@ -87,6 +102,18 @@ def annotate_and_count_measure(self, record):
             existing_ensembl_ids = self.eval_gene_mappings.get(record.accession, [])
             self.gene_metrics.count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes)
             self.conseq_metrics.count_and_score(cv_set=record.measure.existing_so_terms, cmat_set=annotated_conseqs)
+            if variant_category == 'SIMPLE':
+                self.simple_variant_metrics[0].count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes)
+                self.simple_variant_metrics[1].count_and_score(cv_set=record.measure.existing_so_terms,
+                                                               cmat_set=annotated_conseqs)
+            elif variant_category == 'REPEAT':
+                self.repeat_variant_metrics[0].count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes)
+                self.repeat_variant_metrics[1].count_and_score(cv_set=record.measure.existing_so_terms,
+                                                               cmat_set=annotated_conseqs)
+            elif variant_category == 'COMPLEX':
+                self.complex_variant_metrics[0].count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes)
+                self.complex_variant_metrics[1].count_and_score(cv_set=record.measure.existing_so_terms,
+                                                               cmat_set=annotated_conseqs)
 
     def annotate_and_count_traits(self, record):
         for trait in record.traits_with_valid_names:
@@ -107,40 +134,41 @@ def annotate_and_count_traits(self, record):
 
             # Evaluation
             if self.eval_xref_mappings and self.eval_latest_mappings:
+                existing_current_efo_ids = set()
                 for cv_id in existing_efo_ids:
                     # Check whether existing ID is obsolete
                     self.obsolete_counts['cv_total'] += 1
                     if self.eval_xref_mappings[cv_id]['is_obsolete']:
                         self.obsolete_counts['cv_obsolete'] += 1
+                    # Only record current EFO-contained IDs for comparison
+                    elif self.eval_xref_mappings[cv_id]['synonyms']:
+                        existing_current_efo_ids.add(cv_id)
 
-                annotated_efo_ids = set()
+                annotated_current_efo_ids = set()
                 for efo_id in efo_ids:
 
                     # Check whether annotated ID is obsolete
                     self.obsolete_counts['cmat_total'] += 1
                     if self.eval_latest_mappings[efo_id]['is_obsolete']:
                         self.obsolete_counts['cmat_obsolete'] += 1
+                        # Don't add to the set of annotations if obsolete
+                        continue
 
-                    for cv_id in existing_efo_ids:
+                    for cv_id in existing_current_efo_ids:
                         # Attempt to match an ID in ClinVar based on synonyms - if our ID is in the list of synonyms for
                         # a ClinVar ID (or vice versa), we use the synonymous ClinVar ID for comparison.
                         if (efo_id in self.eval_xref_mappings[cv_id]['synonyms']
                                 or cv_id in self.eval_latest_mappings[efo_id]['synonyms']):
-                            annotated_efo_ids.add(cv_id)
+                            annotated_current_efo_ids.add(cv_id)
 
-                        # Similarly attempt to match based on neighbors
-                        # TODO include direction (parents vs. children) somehow
-                        if (efo_id in self.eval_xref_mappings[cv_id]['parents']
-                                or efo_id in self.eval_xref_mappings[cv_id]['children']):
-                            annotated_efo_ids.add(cv_id)
                     # If didn't find anything, just use our ID, which will count as not matching.
-                    if not annotated_efo_ids:
-                        annotated_efo_ids.add(efo_id)
+                    if not annotated_current_efo_ids:
+                        annotated_current_efo_ids.add(efo_id)
 
-                self.trait_metrics.count_and_score(cv_set=existing_efo_ids, cmat_set=annotated_efo_ids)
+                self.trait_metrics.count_and_score(cv_set=existing_current_efo_ids, cmat_set=annotated_current_efo_ids)
                 # Output mismatches for manual inspection
-                if existing_efo_ids and annotated_efo_ids and len(existing_efo_ids & annotated_efo_ids) == 0:
-                    self.mismatches_file.write(f"{record.accession}\t{','.join(existing_efo_ids)}\t{','.join(annotated_efo_ids)}\n")
+                if existing_current_efo_ids and annotated_current_efo_ids and len(existing_current_efo_ids & annotated_current_efo_ids) == 0:
+                    self.mismatches_file.write(f"{record.accession}\t{','.join(existing_current_efo_ids)}\t{','.join(annotated_current_efo_ids)}\n")
 
     def report(self):
         print('\nOverall counts (RCVs):')
@@ -150,6 +178,21 @@ def report(self):
             self.gene_metrics.report()
             print('\nFunctional consequences:')
             self.conseq_metrics.report()
+
+            print('\nBy variant type:')
+            print('\n\tSimple (genes):')
+            self.simple_variant_metrics[0].report()
+            print('\n\tSimple (consequences):')
+            self.simple_variant_metrics[1].report()
+            print('\n\tRepeat (genes):')
+            self.repeat_variant_metrics[0].report()
+            print('\n\tRepeat (consequences):')
+            self.repeat_variant_metrics[1].report()
+            print('\n\tComplex (genes):')
+            self.complex_variant_metrics[0].report()
+            print('\n\tComplex (consequences):')
+            self.complex_variant_metrics[1].report()
+
         if self.eval_xref_mappings:
             print('\nTrait mappings:')
             self.trait_metrics.report()
@@ -193,8 +236,8 @@ class EnsemblAnnotatedClinVarMeasure(ClinVarRecordMeasure):
     def add_ensembl_annotations(self, consequences):
         consequence_elts = []
         for consequence_attributes in consequences:
-            attr_set_elt = ET.Element('AttributeSet')
-            attribute_elt = ET.Element('Attribute', attrib={'Type': 'MolecularConsequence', 'providedBy': PROCESSOR})
+            attr_set_elt = ET.Element('AttributeSet', attrib={'providedBy': PROCESSOR})
+            attribute_elt = ET.Element('Attribute', attrib={'Type': 'MolecularConsequence'})
             attribute_elt.text = consequence_attributes.so_term.so_name.replace('_', ' ')
             so_elt = ET.Element('XRef', attrib={'ID': self.format_so_term(consequence_attributes.so_term),
                                                 'DB': 'Sequence Ontology'})
@@ -238,19 +281,25 @@ def load_evaluation_xref_mappings(input_path):
     with open(input_path) as input_file:
         for line in input_file:
             cols = line.strip().split('\t')
-            if len(cols) != 5:
+            if len(cols) == 5:
+                mapping[cols[0]] = {
+                    'is_obsolete': cols[1] == 'True',
+                    'synonyms': string_to_set(cols[2]),
+                    'parents': string_to_set(cols[3]),
+                    'children': string_to_set(cols[4])
+                }
+            elif len(cols) == 3:
+                mapping[cols[0]] = {
+                    'is_obsolete': cols[1] == 'True',
+                    'synonyms': string_to_set(cols[2])
+                }
+            else:
                 continue
-            mapping[cols[0]] = {
-                'is_obsolete': cols[1] == 'True',
-                'synonyms': string_to_set(cols[2]),
-                'parents': string_to_set(cols[3]),
-                'children': string_to_set(cols[4])
-            }
     return mapping
 
 
 def string_to_set(s):
-    return set(re.sub(r"{|}|'", '', s).split(', '))
+    return set(x for x in re.sub(r"{|}|'", '', s).split(', ') if x)
 
 
 def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file,

diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py
@@ -149,7 +149,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
         # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele
         # origins, (2) EFO mappings, and (3) genes where the variant has effect.
         grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
-        consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
+        consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
         grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
                                                          string_to_efo_mappings)
 
@@ -290,7 +290,7 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict):
     # expansion variants will be fed to VEP and will receive incorrect consequence annotations. By using RCV pairing
     # first, we prioritise results of the variant expansion pipeline over the general VEP pipeline.
     if clinvar_record_measure.clinvar_record.accession in consequence_type_dict:
-        return consequence_type_dict[clinvar_record_measure.clinvar_record.accession]
+        return consequence_type_dict[clinvar_record_measure.clinvar_record.accession], 'REPEAT'
 
     # If RCV is not present in the consequences file, pair using full variant description (CHROM:POS:REF:ALT)
     if clinvar_record_measure.has_complete_coordinates:
@@ -302,7 +302,7 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict):
         if IUPAC_AMBIGUOUS_SEQUENCE.search(clinvar_record_measure.vcf_ref + clinvar_record_measure.vcf_alt):
             logger.warning(f'Observed variant with non-ACGT allele sequences: {coord_id}')
         if coord_id in consequence_type_dict:
-            return consequence_type_dict[coord_id]
+            return consequence_type_dict[coord_id], 'SIMPLE'
 
     # If there's also no complete coordinates, pair using HGVS
     if clinvar_record_measure.preferred_current_hgvs:
@@ -311,12 +311,12 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict):
             consequences = consequence_type_dict[hgvs_id]
             if len(consequences) > MAX_TARGET_GENES:
                 logger.warning(f'Skipping variant {hgvs_id} with {len(consequences)} target genes')
-                return []
-            return consequences
+                return [], 'NONE'
+            return consequences, 'COMPLEX'
 
     # Previously, the pairing was also attempted based on rsID and nsvID. This is not reliable because of lack of allele
     # specificity, and has been removed.
-    return []
+    return [], 'NONE'
 
 
 def write_string_list_to_file(string_list, filename):

diff --git a/cmat/output_generation/evaluation/ols_utils.py b/cmat/output_generation/evaluation/ols_utils.py
@@ -1,6 +1,8 @@
 import logging
 from functools import lru_cache
 
+from requests import RequestException
+
 from cmat.clinvar_xml_io.ontology_uri import OntologyUri
 from cmat.trait_mapping.utils import json_request
 from cmat.trait_mapping.ols import build_ols_query
@@ -26,14 +28,18 @@ def fetch_eval_data(*, db_iden=None, uri=None, include_neighbors=False):
         return None
     curie = OntologyUri.uri_to_curie(ontology_uri)
 
-    # Defaults to return if OLS query fails
+    # Defaults to return if OLS query fails or no term in EFO
     is_obsolete = False
-    synonyms = {curie}
+    synonyms = {}
     parents = {}
     children = {}
 
     url = build_ols_query(ontology_uri)
-    json_response = json_request(url)
+    try:
+        json_response = json_request(url)
+    except RequestException:
+        logger.warning(f'OLS4 error for {url}, trying OLS3...')
+        json_response = json_request(url.replace('/ols4/', '/ols/'))
     if json_response and '_embedded' in json_response:
         for term in json_response['_embedded']['terms']:
             # Get only EFO terms (even if imported)

diff --git a/cmat/trait_mapping/main.py b/cmat/trait_mapping/main.py
@@ -1,6 +1,7 @@
 import csv
 import logging
 import multiprocessing
+from collections import Counter
 
 from cmat.clinvar_xml_io import ClinVarTrait
 from cmat.trait_mapping.output import output_trait
@@ -121,7 +122,9 @@ def process_traits(traits_filepath, output_mappings_filepath, output_curation_fi
         ]
 
         logger.info('Writing output with the processed traits')
+        finished_source_counts = Counter()
         for trait in processed_trait_list:
-            output_trait(trait, mapping_writer, curation_writer)
+            output_trait(trait, mapping_writer, curation_writer, finished_source_counts)
 
     logger.info('Finished processing trait names')
+    logger.info(f'Source counts for finished mappings: {finished_source_counts}')
diff --git a/cmat/trait_mapping/output.py b/cmat/trait_mapping/output.py
@@ -1,16 +1,28 @@
 import csv
+from collections import Counter
 
 from cmat.trait_mapping.trait import Trait
 
 
-def output_trait_mapping(trait: Trait, mapping_writer: csv.writer):
+def output_trait_mapping(trait: Trait, mapping_writer: csv.writer, finished_source_counts: Counter = None):
     """
     Write any finished ontology mappings for a trait to a csv file writer.
 
     :param trait: A trait with finished ontology mappings in finished_mapping_set
     :param mapping_writer: A csv.writer to write the finished mappings
+    :param finished_source_counts: Optional Counter to count sources of finished mappings
     """
     for ontology_entry in trait.finished_mapping_set:
+        # Need the corresponding Zooma result
+        zooma_mapping = None
+        for zooma_result in trait.zooma_result_list:
+            for zm in zooma_result.mapping_list:
+                if (zm.in_efo and zm.is_current and ontology_entry.uri == zm.uri
+                        and ontology_entry.label == zm.ontology_label):
+                    zooma_mapping = zm
+                    break
+        if zooma_mapping and finished_source_counts:
+            finished_source_counts[zooma_mapping.source.lower()] += 1
         mapping_writer.writerow([trait.name, ontology_entry.uri, ontology_entry.label])
 
 
@@ -55,7 +67,7 @@ def output_for_curation(trait: Trait, curation_writer: csv.writer):
     curation_writer.writerow(output_row)
 
 
-def output_trait(trait: Trait, mapping_writer: csv.writer, curation_writer: csv.writer):
+def output_trait(trait: Trait, mapping_writer: csv.writer, curation_writer: csv.writer, finished_source_counts: Counter):
     """
     Output finished ontology mappings of a trait, or non-finished mappings (if any) for curation.
 
@@ -64,6 +76,6 @@ def output_trait(trait: Trait, mapping_writer: csv.writer, curation_writer: csv.
     :param curation_writer: A csv.writer to write non-finished ontology mappings for manual curation
     """
     if trait.is_finished:
-        output_trait_mapping(trait, mapping_writer)
+        output_trait_mapping(trait, mapping_writer, finished_source_counts)
     else:
         output_for_curation(trait, curation_writer)
diff --git a/setup.py b/setup.py
@@ -27,7 +27,7 @@ def get_requires():
     long_description = fh.read()
 
 setup(name='cmat',
-      version='3.0.1',
+      version='3.0.2',
       author_email='opentargets-clinvar@ebi.ac.uk',
       url='https://github.com/EBIvariation/eva-opentargets',
       packages=find_packages(),

diff --git a/tests/output_generation/resources/expected_annotation_output.xml.gz b/tests/output_generation/resources/expected_annotation_output.xml.gz
diff --git a/tests/output_generation/test_annotated_clinvar.py b/tests/output_generation/test_annotated_clinvar.py
@@ -2,11 +2,17 @@
 import os
 import re
 
-from cmat.output_generation.annotated_clinvar import generate_annotated_clinvar_xml
+from cmat.output_generation.annotated_clinvar import generate_annotated_clinvar_xml, string_to_set
 
 resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
 
 
+def test_string_to_set():
+    assert string_to_set("{'HP:0002269'}") == {'HP:0002269'}
+    assert string_to_set("{'Orphanet:49382', 'MONDO:0018852'}") == {'Orphanet:49382', 'MONDO:0018852'}
+    assert string_to_set('{}') == set()
+
+
 def test_generate_annotated_xml():
     input_file = os.path.join(resources_dir, 'test_annotation_input.xml.gz')
     efo_mapping_file = os.path.join(resources_dir, 'string_to_ontology_mappings.tsv')