diff --git a/bin/evaluation/map_genes.py b/bin/evaluation/map_genes.py
new file mode 100644
index 00000000..fcd6342c
--- /dev/null
+++ b/bin/evaluation/map_genes.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import argparse
+
+import pandas as pd
+
+from consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info
+from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
+
+
+def load_clinvar_data(clinvar_xml):
+    """Load ClinVar data, process for gene symbols and HGNC IDs, and return it as a Pandas dataframe.
+     Modified from similar functionality in the repeat expansion pipeline."""
+    variant_data = []  # To populate the return dataframe (see columns below)
+    for clinvar_record in clinvar_xml_io.ClinVarDataset(clinvar_xml):
+        # Skip a record if it does not contain variant information
+        if not clinvar_record.measure:
+            continue
+        measure = clinvar_record.measure
+
+        # Extract gene symbol(s). Here and below, dashes are sometimes assigned to be compatible with the variant
+        # summary format which was used previously.
+        gene_symbols = measure.preferred_gene_symbols
+        if not gene_symbols:
+            gene_symbols = ['-']
+
+        # Extract HGNC ID
+        hgnc_ids = measure.hgnc_ids
+        hgnc_id = hgnc_ids[0] if len(hgnc_ids) == 1 and len(gene_symbols) == 1 else '-'
+
+        # Append data strings (including unused columns for compatibility)
+        for gene_symbol in gene_symbols:
+            variant_data.append([
+                measure.preferred_or_other_name,
+                clinvar_record.accession,
+                gene_symbol,
+                hgnc_id,
+                None,
+                None
+            ])
+
+    variants = pd.DataFrame(variant_data, columns=('Name',
+                                                   'RCVaccession',
+                                                   'GeneSymbol',
+                                                   'HGNC_ID',
+                                                   'TranscriptID',
+                                                   'RepeatType'))
+
+    # Since the same record can have coordinates in multiple builds, it can be repeated. Remove duplicates
+    variants = variants.drop_duplicates()
+    return variants
+
+
+def main(clinvar_xml, output_file):
+    """Load ClinVar XML, map to Ensembl gene IDs, and dump results to TSV."""
+    variants = load_clinvar_data(clinvar_xml)
+    annotated_variants = annotate_ensembl_gene_info(variants)
+    annotated_variants[['RCVaccession', 'EnsemblGeneID']].to_csv(output_file, sep='\t', index=False, header=False)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Script to convert HGNC IDs and gene symbols in ClinVar to Ensembl')
+    parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file')
+    parser.add_argument('--output-file', required=True, help='File to output dataframe')
+    args = parser.parse_args()
+    main(args.clinvar_xml, args.output_file)
diff --git a/bin/evaluation/map_xrefs.py b/bin/evaluation/map_xrefs.py
new file mode 100644
index 00000000..55bb8066
--- /dev/null
+++ b/bin/evaluation/map_xrefs.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import multiprocessing
+from functools import lru_cache
+
+from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
+from eva_cttv_pipeline.trait_mapping.ols import build_ols_query
+from eva_cttv_pipeline.trait_mapping.utils import json_request
+
+
+@lru_cache
+def get_synonyms(db, iden):
+    """Find synonyms (replacement terms or exact matches) for this ontology identifier using OLS."""
+    synonyms = set()
+    ontology_uri = OntologyUri(iden, db)
+    url = build_ols_query(str(ontology_uri))
+    json_response = json_request(url)
+    if json_response and '_embedded' in json_response:
+        for term in json_response['_embedded']['terms']:
+            # Get only EFO terms (even if imported)
+            if term['ontology_name'] == 'efo':
+                synonyms.add(term['iri'])
+                # Check whether current term is obsolete
+                if term['is_obsolete'] and term['term_replaced_by']:
+                    synonyms.add(term['term_replaced_by'])
+                # Also add exact matches
+                if 'exactMatch' in term['annotation']:
+                    synonyms.update(term['annotation']['exactMatch'])
+
+        # Synonyms contains current EFO-included URIs, convert to DB:ID style
+        synonyms = {OntologyUri.uri_to_curie(s) for s in synonyms}
+        # Filter out Nones
+        synonyms = {s for s in synonyms if s is not None}
+
+    if synonyms:
+        return ontology_uri.curie, synonyms
+    # If no synonyms, just return the original identifier as is
+    return ontology_uri.curie, {ontology_uri.curie}
+
+
+def main(clinvar_xml, output_file):
+    """Load ClinVar XML, map trait xrefs identifiers to synonyms in OLS, and dump results to TSV."""
+    traits = set()
+    for record in clinvar_xml_io.ClinVarDataset(clinvar_xml):
+        for trait in record.traits_with_valid_names:
+            traits.update([(db, iden) for db, iden, _ in trait.current_efo_aligned_xrefs])
+
+    traits = list(traits)
+    process_pool = multiprocessing.Pool(processes=24)
+    annotated_traits = [
+        process_pool.apply(get_synonyms, args=(db, iden))
+        for db, iden in traits
+    ]
+    with open(output_file, 'w+') as f:
+        csv.writer(f, delimiter="\t").writerows(annotated_traits)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Script to map trait xrefs in ClinVar to synonyms in OLS')
+    parser.add_argument('--clinvar-xml', required=True, help='ClinVar XML dump file')
+    parser.add_argument('--output-file', required=True, help='File to output dataframe')
+    args = parser.parse_args()
+    main(args.clinvar_xml, args.output_file)
diff --git a/bin/generate_annotated_xml.py b/bin/generate_annotated_xml.py
index 9c7f9c25..83bf3c75 100644
--- a/bin/generate_annotated_xml.py
+++ b/bin/generate_annotated_xml.py
@@ -8,10 +8,12 @@
 parser.add_argument('--efo-mapping',  help='Disease string to ontology mappings',    required=True)
 parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
 parser.add_argument('--output-xml',   help='Output XML file',                        required=True)
+parser.add_argument('--eval-gene-file', help='Gene mappings for evaluation',         required=False)
+parser.add_argument('--eval-xref-file', help='Trait xref mappings for evaluation',   required=False)
 
 
 if __name__ == '__main__':
     args = parser.parse_args()
     generate_annotated_clinvar_xml(
         clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
-        output_xml_file=args.output_xml)
+        output_xml_file=args.output_xml, eval_gene_file=args.eval_gene_file, eval_xref_file=args.eval_xref_file)
diff --git a/bin/trait_mapping/create_efo_table.py b/bin/trait_mapping/create_efo_table.py
index 561347c3..3df9dd79 100644
--- a/bin/trait_mapping/create_efo_table.py
+++ b/bin/trait_mapping/create_efo_table.py
@@ -4,6 +4,9 @@
 import re
 import requests
 
+from requests import HTTPError
+from retry import retry
+
 # Name of ontology in OLS url, e. g. https://www.ebi.ac.uk/ols/ontologies/ordo/terms?iri=...
 ontology_to_ols = {
     'HP': 'hp',
@@ -49,15 +52,17 @@ def uri_to_curie(uri):
     return uri.split('/')[-1].replace('#', '').replace('_', ':')
 
 
+@retry(HTTPError, tries=4, delay=2, backoff=1.2, jitter=(1, 3))
 def get_cross_references(curie):
     """Queries OxO to return the list of cross-references for a given term curie."""
     url = oxo_url_template.format(curie=curie)
-    response = requests.get(url).json()
-    if '_embedded' not in response:
-        print('Warning: OxO error for term {}. No cross-links will be available for this term. '
-              'See https://github.com/EBISPOT/OXO/issues/26'.format(curie))
-        return []
-    mappings = response['_embedded']['searchResults'][0]['mappingResponseList']
+    response = requests.get(url)
+    response.raise_for_status()
+    json_response = response.json()
+    if '_embedded' not in json_response:
+        raise ValueError('Warning: OxO error for term {}. No cross-links will be available for this term. '
+                         'See https://github.com/EBISPOT/OXO/issues/26'.format(curie))
+    mappings = json_response['_embedded']['searchResults'][0]['mappingResponseList']
     return [m['curie'] for m in mappings]
 
 
@@ -79,7 +84,12 @@ def get_ols_details(ontology, term):
     # Cross-references
     term_curie = uri_to_curie(term)
     xrefs = {}
-    for x in get_cross_references(term_curie):
+    try:
+        oxo_xrefs = get_cross_references(term_curie)
+    except (HTTPError, ValueError) as e:
+        print('Warning: OxO error for term {}. No cross-links will be available for this term.'.format(term_curie))
+        oxo_xrefs = []
+    for x in oxo_xrefs:
         xref_ontology, xref_id = re.split('[_:]', x)
         xrefs.setdefault(xref_ontology, set()).add('{}:{}'.format(xref_ontology, xref_id))
 
diff --git a/bin/traits_to_zooma_format.py b/bin/traits_to_zooma_format.py
index ae4dffff..8b94765a 100644
--- a/bin/traits_to_zooma_format.py
+++ b/bin/traits_to_zooma_format.py
@@ -5,28 +5,7 @@
 from time import gmtime, strftime
 
 from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
-
-
-class OntologyUri:
-    # ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with
-    # some examples of how this looks like in ClinVar data.
-    db_to_uri_conversion = {
-        'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}',  # <XRef ID="1756" DB="Orphanet"/>
-        'omim': lambda x: f'https://www.omim.org/entry/{x}',  # <XRef Type="MIM" ID="612773" DB="OMIM"/>
-        'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}',  # <XRef ID="EFO_0005137" DB="EFO"/>
-        'mesh': lambda x: f'http://identifiers.org/mesh/{x}',  # <XRef ID="D065630" DB="MeSH"/>
-        'medgen': lambda x: f'http://identifiers.org/medgen/{x}',  # <XRef ID="C0235833" DB="MedGen"/>
-        # <XRef ID="MONDO:0013353" DB="MONDO"/>
-        'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
-    }
-
-    def __init__(self, id_, db):
-        self.id_ = id_
-        self.db = db
-        self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_)
-
-    def __str__(self):
-        return self.uri
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
 
 
 def write_zooma_record(clinvar_acc, variant_id, trait_name, ontology_uri, date, outfile):
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py
index 5cb29a04..1fdca089 100644
--- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py
+++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_measure.py
@@ -85,6 +85,16 @@ def nsv_id(self):
             logger.warning(f'Found multiple NSV IDs for {self.clinvar_record}, this is not yet supported')
             return None
 
+    @property
+    def existing_so_terms(self):
+        all_so_terms = set()
+        for attr_elem in find_elements(self.measure_xml, './AttributeSet'):
+            if find_elements(attr_elem, './Attribute[@Type="MolecularConsequence"]'):
+                for so_elem in find_elements(attr_elem, './XRef[@DB="Sequence Ontology"]'):
+                    if 'providedBy' not in so_elem.attrib:
+                        all_so_terms.add(so_elem.attrib['ID'])
+        return all_so_terms
+
     @cached_property
     def _hgvs_to_types(self):
         """
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py
index ed89e36d..60a0229e 100644
--- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py
+++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/clinvar_trait.py
@@ -17,6 +17,9 @@ class ClinVarTrait:
         'reclassified - variant of unknown significance', 'see cases', 'variant of unknown significance'
     }
 
+    # These database identifiers are directly importable into EFO
+    EFO_ALIGNED_ONTOLOGIES = {'Human Phenotype Ontology', 'EFO', 'Orphanet', 'MONDO'}
+
     def __init__(self, trait_xml, clinvar_record):
         self.trait_xml = trait_xml
         self.clinvar_record = clinvar_record
@@ -83,3 +86,9 @@ def medgen_id(self):
         else:
             logger.warning(f'Multiple MedGen IDs for {self}: {medgen_ids}')
             return medgen_ids[0]
+
+    @property
+    def current_efo_aligned_xrefs(self):
+        """Returns current EFO or EFO-aligned ids."""
+        return [(db, iden, status) for (db, iden, status) in self.xrefs
+                if status == 'current' and db in self.EFO_ALIGNED_ONTOLOGIES]
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py
new file mode 100644
index 00000000..6b7f162f
--- /dev/null
+++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/ontology_uri.py
@@ -0,0 +1,66 @@
+import logging
+
+logging.basicConfig()
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class OntologyUri:
+    # ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with
+    # some examples of how this looks like in ClinVar data.
+    db_to_uri_conversion = {
+        'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}',  # <XRef ID="1756" DB="Orphanet"/>
+        'omim': lambda x: f'https://www.omim.org/entry/{x}',  # <XRef Type="MIM" ID="612773" DB="OMIM"/>
+        'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}',  # <XRef ID="EFO_0005137" DB="EFO"/>
+        'mesh': lambda x: f'http://identifiers.org/mesh/{x}',  # <XRef ID="D065630" DB="MeSH"/>
+        'medgen': lambda x: f'http://identifiers.org/medgen/{x}',  # <XRef ID="C0235833" DB="MedGen"/>
+        # <XRef ID="MONDO:0013353" DB="MONDO"/>
+        'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
+        # <XRef ID="HP:0011147" DB="Human Phenotype Ontology"/>
+        'hp': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
+    }
+
+    def __init__(self, id_, db):
+        self.id_ = id_
+        self.db = db if db.lower() != 'human phenotype ontology' else 'HP'
+        self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_)
+
+    def __str__(self):
+        return self.uri
+
+    @property
+    def curie(self):
+        return self.uri_to_curie(self.uri)
+
+    @staticmethod
+    def uri_to_curie(uri):
+        """Convert an ontology uri to a DB:ID format."""
+        uri_db_to_curie_db = {
+            "ordo": "Orphanet",
+            "orphanet": "Orphanet",
+            "omim": "OMIM",
+            "efo": "EFO",
+            "hp": "HP",
+            "mondo": "MONDO",
+        }
+        if not any(x in uri.lower() for x in uri_db_to_curie_db.keys()):
+            return None
+        uri = uri.rstrip("/")
+        uri_list = uri.split("/")
+        if "identifiers.org" in uri:
+            db = uri_list[-2]
+            id_ = uri_list[-1]
+        elif "omim.org" in uri:
+            db = "OMIM"
+            id_ = uri_list[-1]
+        else:
+            last_component = uri_list[-1]
+            if ":" in last_component:
+                return last_component
+            elif "_" in last_component:
+                db, id_ = last_component.split("_")
+            else:
+                logger.warning(f"Could not convert URI to CURIE: {uri}")
+                return None
+        db = uri_db_to_curie_db[db.lower()]
+        return "{}:{}".format(db, id_)
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py
index 97f3c93c..9ece88fa 100644
--- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py
+++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/repeat_variant.py
@@ -106,7 +106,7 @@ def parse_all_identifiers(clinvar_measure: ClinVarRecordMeasure):
     Repeat type must be present if possible, transcript id is useful if present."""
     # TODO should we prioritise these in some way?
     all_names = [clinvar_measure.get_variant_name_or_hgvs()] +\
-                [hgvs.text for hgvs in clinvar_measure.current_hgvs] +\
+                sorted([hgvs.text for hgvs in clinvar_measure.current_hgvs]) +\
                 clinvar_measure.all_names
     for name in all_names:
         repeat_type, transcript_id = infer_repeat_type_and_transcript_id(name)
diff --git a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py
index a57251a4..3a7ddf3b 100644
--- a/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py
+++ b/eva_cttv_pipeline/clinvar_xml_io/clinvar_xml_io/xml_parsing.py
@@ -14,6 +14,7 @@ def parse_header_attributes(clinvar_xml):
             if elem.tag == 'ReleaseSet':
                 attrib = elem.attrib
                 break
+            elem.clear()
     if not attrib:
         return {}
     # Resolve xsi:noNamespaceSchemaLocation="...", which is parsed strangely by ElementTree
diff --git a/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py b/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py
new file mode 100644
index 00000000..9175fdc1
--- /dev/null
+++ b/eva_cttv_pipeline/clinvar_xml_io/tests/test_ontology_uri.py
@@ -0,0 +1,11 @@
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
+
+
+def test_uri_to_curie():
+    assert OntologyUri.uri_to_curie('http://www.orpha.net/ORDO/Orphanet_713') == 'Orphanet:713'
+    assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/HP_0002930') == 'HP:0002930'
+    assert OntologyUri.uri_to_curie('https://omim.org/entry/300377') == 'OMIM:300377'
+    # If for some reason we're already passed a CURIE (probably a mistake in OLS response), return it as-is
+    assert OntologyUri.uri_to_curie('HP:0002505') == 'HP:0002505'
+    # Not a supported db
+    assert OntologyUri.uri_to_curie('http://purl.obolibrary.org/obo/DOID_10652') == None
diff --git a/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py b/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py
index 5c374250..fd9833aa 100644
--- a/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py
+++ b/eva_cttv_pipeline/evidence_string_generation/annotated_clinvar.py
@@ -1,4 +1,8 @@
+import re
 import xml.etree.ElementTree as ET
+from collections import defaultdict
+
+from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.ontology_uri import OntologyUri
 
 from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io import ClinVarTrait, ClinVarRecordMeasure, ClinVarDataset, \
     ClinVarRecord
@@ -6,7 +10,7 @@
 from eva_cttv_pipeline.evidence_string_generation.clinvar_to_evidence_strings import load_efo_mapping, \
     get_consequence_types
 from eva_cttv_pipeline.evidence_string_generation import consequence_type as CT
-
+from eva_cttv_pipeline.evidence_string_generation.set_metrics import SetComparisonMetrics
 
 PROCESSOR = 'CMAT'
 
@@ -15,29 +19,110 @@ class AnnotatingClinVarDataset(ClinVarDataset):
     """This class provides the ability to parse ClinVar records (RCVs) and annotate them with EFO mappings and
     consequence mappings on the fly."""
 
-    def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings):
+    def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings,
+                 eval_gene_mappings=None, eval_xref_mappings=None):
         super().__init__(clinvar_xml)
         self.header_attr['ProcessedBy'] = PROCESSOR
         self.string_to_efo_mappings = string_to_efo_mappings
         self.variant_to_gene_mappings = variant_to_gene_mappings
+        self.eval_gene_mappings = eval_gene_mappings
+        self.eval_xref_mappings = eval_xref_mappings
+        self.overall_counts = {}
+        self.gene_metrics = None
+        self.conseq_metrics = None
+        self.trait_metrics = None
 
     def __iter__(self):
+        # Initialise counts
+        self.overall_counts = {
+            'total': 0,
+            'has_supported_measure': 0,
+            'has_supported_trait': 0,
+        }
+        self.gene_metrics = SetComparisonMetrics()
+        self.conseq_metrics = SetComparisonMetrics()
+        self.trait_metrics = SetComparisonMetrics()
+
         for rcv in iterate_rcv_from_xml(self.clinvar_xml):
             record = AnnotatedClinVarRecord(rcv)
             self.annotate(record)
             yield record
 
+        # Finalise counts - computes averages, etc.
+        self.gene_metrics.finalise()
+        self.conseq_metrics.finalise()
+        self.trait_metrics.finalise()
+
     def annotate(self, record):
+        self.overall_counts['total'] += 1
         # Functional consequences for measure
         if record.measure:
-            consequence_types = get_consequence_types(record.measure, self.variant_to_gene_mappings)
-            record.measure.add_ensembl_annotations(consequence_types)
-        # EFO terms for trait
+            self.overall_counts['has_supported_measure'] += 1
+            self.annotate_and_count_measure(record)
+        # EFO terms for traits
+        if record.traits_with_valid_names:
+            self.overall_counts['has_supported_trait'] += 1
+            self.annotate_and_count_traits(record)
+
+    def annotate_and_count_measure(self, record):
+        consequence_types = get_consequence_types(record.measure, self.variant_to_gene_mappings)
+        record.measure.add_ensembl_annotations(consequence_types)
+
+        annotated_genes = {ct.ensembl_gene_id for ct in consequence_types if ct.ensembl_gene_id}
+        annotated_conseqs = {EnsemblAnnotatedClinVarMeasure.format_so_term(ct.so_term)
+                             for ct in consequence_types if ct.so_term}
+
+        if self.eval_gene_mappings:
+            existing_ensembl_ids = self.eval_gene_mappings.get(record.accession, [])
+            self.gene_metrics.count_and_score(cv_set=existing_ensembl_ids, cmat_set=annotated_genes)
+            self.conseq_metrics.count_and_score(cv_set=record.measure.existing_so_terms, cmat_set=annotated_conseqs)
+
+    def annotate_and_count_traits(self, record):
         for trait in record.traits_with_valid_names:
+            existing_efo_ids = set()
+            annotated_efo_ids = set()
             efo_ids = []
             for trait_name in trait.all_names:
-                efo_ids.extend(efo_id for efo_id, efo_label in self.string_to_efo_mappings.get(trait_name.lower(), []))
+                efo_ids.extend(
+                    EfoMappedClinVarTrait.format_efo_id(efo_id)
+                    for efo_id, efo_label in self.string_to_efo_mappings.get(trait_name.lower(), []))
+
+            for db, iden, _ in trait.current_efo_aligned_xrefs:
+                curie = OntologyUri(iden, db).curie
+                if curie:
+                    existing_efo_ids.add(curie)
             trait.add_efo_mappings(efo_ids)
+            if self.eval_xref_mappings:
+                for efo_id in efo_ids:
+                    # Attempt to match to an ID in ClinVar based on synonyms - if our ID is in the list of synonyms for
+                    # a ClinVar ID, we use the synonymous ClinVar ID for comparison.
+                    for cv_id in existing_efo_ids:
+                        if efo_id in self.eval_xref_mappings.get(cv_id, {}):
+                            annotated_efo_ids.add(cv_id)
+                    # If didn't find anything, just use our ID, which will count as not matching.
+                    if not annotated_efo_ids:
+                        annotated_efo_ids.add(efo_id)
+
+                self.trait_metrics.count_and_score(cv_set=existing_efo_ids, cmat_set=annotated_efo_ids)
+
+    def report(self):
+        print('\nOverall counts (RCVs):')
+        self.print_counter(self.overall_counts)
+        if self.eval_gene_mappings:
+            print('\nGene annotations:')
+            self.gene_metrics.report()
+            print('\nFunctional consequences:')
+            self.conseq_metrics.report()
+        if self.eval_xref_mappings:
+            print('\nTrait mappings:')
+            self.trait_metrics.report()
+        print()
+
+    @staticmethod
+    def print_counter(counter):
+        max_len = len(max(counter.keys(), key=lambda x: len(x)))
+        for k, v in counter.items():
+            print(f'{k: <{max_len}} {v}')
 
 
 class AnnotatedClinVarRecord(ClinVarRecord):
@@ -51,11 +136,18 @@ class EfoMappedClinVarTrait(ClinVarTrait):
     def add_efo_mappings(self, efo_ids):
         efo_elts = []
         for efo_id in efo_ids:
-            if efo_id.startswith('http'):
-                efo_id = efo_id.split('/')[-1].replace('_', ':')
-            efo_elts.append(ET.Element('XRef', attrib={'ID': efo_id, 'DB': 'EFO', 'providedBy': PROCESSOR}))
+            efo_id = self.format_efo_id(efo_id)
+            # Include Status attribute so this isn't included among current xrefs
+            efo_elts.append(ET.Element('XRef', attrib={
+                'ID': efo_id, 'DB': 'EFO', 'Status': 'annotated', 'providedBy': PROCESSOR}))
         self.trait_xml.extend(efo_elts)
 
+    @staticmethod
+    def format_efo_id(efo_id):
+        if efo_id.startswith('http'):
+            return efo_id.split('/')[-1].replace('_', ':')
+        return efo_id
+
 
 class EnsemblAnnotatedClinVarMeasure(ClinVarRecordMeasure):
 
@@ -65,19 +157,54 @@ def add_ensembl_annotations(self, consequences):
             attr_set_elt = ET.Element('AttributeSet')
             attribute_elt = ET.Element('Attribute', attrib={'Type': 'MolecularConsequence', 'providedBy': PROCESSOR})
             attribute_elt.text = consequence_attributes.so_term.so_name.replace('_', ' ')
-            so_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.so_term.accession.replace('_', ':'),
+            so_elt = ET.Element('XRef', attrib={'ID': self.format_so_term(consequence_attributes.so_term),
                                                 'DB': 'Sequence Ontology'})
             ensembl_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.ensembl_gene_id, 'DB': 'Ensembl'})
             attr_set_elt.extend((attribute_elt, so_elt, ensembl_elt))
             consequence_elts.append(attr_set_elt)
         self.measure_xml.extend(consequence_elts)
 
+    @staticmethod
+    def format_so_term(so_term):
+        return so_term.accession.replace('_', ':')
+
 
-def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file):
+def load_evaluation_gene_mappings(input_path):
+    mapping = defaultdict(list)
+    with open(input_path) as input_file:
+        for line in input_file:
+            cols = line.strip().split('\t')
+            if len(cols) != 2:
+                continue
+            mapping[cols[0]].append(cols[1])
+    return mapping
+
+
+def load_evaluation_xref_mappings(input_path):
+    mapping = {}
+    with open(input_path) as input_file:
+        for line in input_file:
+            cols = line.strip().split('\t')
+            if len(cols) != 2:
+                continue
+            mapping[cols[0]] = re.sub(r"{|}|'", '', cols[1]).split(', ')
+    return mapping
+
+
+def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file,
+                                   eval_gene_file=None, eval_xref_file=None):
     """Generate an annotated XML file of ClinVar RCVs based on EFO mappings file and gene mapping file (as documented in
     clinvar_to_evidence_strings)."""
     string_to_efo_mappings = load_efo_mapping(efo_mapping_file)
     variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)
-
-    dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings)
+    # Need both files to do an evaluation
+    if eval_gene_file and eval_xref_file:
+        eval_gene_mappings = load_evaluation_gene_mappings(eval_gene_file)
+        eval_xref_mappings = load_evaluation_xref_mappings(eval_xref_file)
+        dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings,
+                                           eval_gene_mappings=eval_gene_mappings,
+                                           eval_xref_mappings=eval_xref_mappings)
+    else:
+        dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings)
     dataset.write(output_xml_file)
+    dataset.report()
diff --git a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf
index 3f23d07d..f104683e 100644
--- a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf
+++ b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf
@@ -11,6 +11,8 @@ def helpMessage() {
         --output_dir    Directory for output
         --schema        Open Targets JSON schema version (optional, will output XML if omitted)
         --clinvar       ClinVar XML file (optional, will download latest if omitted)
+        --mappings      Trait mappings file (optional, will use a default path if omitted)
+        --evaluate      Whether to run evaluation or not (default false)
     """
 }
 
@@ -18,6 +20,8 @@ params.help = null
 params.output_dir = null
 params.schema = null
 params.clinvar = null
+params.mappings = '${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv'
+params.evaluate = false
 
 if (params.help) {
     exit 0, helpMessage()
@@ -58,7 +62,14 @@ workflow {
 
     } else {
         // Annotated ClinVar XML output
-        generateAnnotatedXml(clinvarXml, combineConsequences.out.consequencesCombined)
+        if (params.evaluate) {
+            evalGeneMapping = mapGenes(clinvarXml)
+            evalXrefMapping = mapXrefs(clinvarXml)
+        } else {
+            evalGeneMapping = null
+            evalXrefMapping = null
+        }
+        generateAnnotatedXml(clinvarXml, combineConsequences.out.consequencesCombined, evalGeneMapping, evalXrefMapping)
     }
 }
 
@@ -201,6 +212,42 @@ process combineConsequences {
     """
 }
 
+/*
+ * Map existing genes to Ensembl gene IDs. Currently used only for evaluation.
+ */
+process mapGenes {
+    input:
+    path clinvarXml
+
+    output:
+    path "output_gene_mappings.tsv", emit: outputGeneMappings
+
+    script:
+    """
+    \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_genes.py \
+        --clinvar-xml ${clinvarXml} \
+        --output-file output_gene_mappings.tsv
+    """
+}
+
+/*
+ * Map existing crossrefs to EFO-aligned IDs. Currently used only for evaluation.
+ */
+process mapXrefs {
+    input:
+    path clinvarXml
+
+    output:
+    path "output_xref_mappings.tsv", emit: outputXrefMappings
+
+    script:
+    """
+    \${PYTHON_BIN} \${CODE_ROOT}/bin/evaluation/map_xrefs.py \
+        --clinvar-xml ${clinvarXml} \
+        --output-file output_xref_mappings.tsv
+    """
+}
+
 /*
  * Generate annotated ClinVar XML
  */
@@ -216,16 +263,21 @@ process generateAnnotatedXml {
     input:
     path clinvarXml
     path consequenceMappings
+    path evalGeneMapping
+    path evalXrefMapping
 
     output:
     path "annotated_clinvar.xml.gz"
 
     script:
+    def evalGeneFlag = evalGeneMapping != null? "--eval-gene-file ${evalGeneMapping}" : ""
+    def evalXrefFlag = evalXrefMapping != null? "--eval-xref-file ${evalXrefMapping}" : ""
     """
     \${PYTHON_BIN} \${CODE_ROOT}/bin/generate_annotated_xml.py \
         --clinvar-xml ${clinvarXml} \
-        --efo-mapping \${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv \
+        --efo-mapping ${params.mappings} \
         --gene-mapping ${consequenceMappings} \
+        ${evalGeneFlag} ${evalXrefFlag} \
         --output-xml annotated_clinvar.xml.gz
     """
 }
diff --git a/eva_cttv_pipeline/evidence_string_generation/set_metrics.py b/eva_cttv_pipeline/evidence_string_generation/set_metrics.py
new file mode 100644
index 00000000..384c21bb
--- /dev/null
+++ b/eva_cttv_pipeline/evidence_string_generation/set_metrics.py
@@ -0,0 +1,91 @@
+class SetComparisonMetrics:
+    """Records counts and scores for comparing sets of arbitrary values from two sources (ClinVar and CMAT)."""
+
+    both_present_keys = [
+        'exact_match',
+        'cmat_superset',
+        'cmat_subset',
+        'divergent_match',
+        'mismatch'
+    ]
+    some_missing_keys = [
+        'cv_missing',
+        'cmat_missing',
+        'both_missing'
+    ]
+    all_keys = both_present_keys + some_missing_keys
+
+    def __init__(self):
+        self.counts = {k: 0 for k in self.all_keys}
+        self.scores = {k: 0 for k in self.all_keys}
+        # To keep counts in self.counts disjoint, add a separate field (calculated last) for both_present
+        self.both_present_count = 0
+        self.both_present_score = 0
+
+    def count_and_score(self, cv_set, cmat_set):
+        cv_set = set(cv_set)
+        cmat_set = set(cmat_set)
+
+        # First check if either set is empty - if so we count these but don't compute f-scores, as we do not assume
+        # either ClinVar or CMAT has complete coverage of the data.
+        if not cv_set and cmat_set:
+            self.counts['cv_missing'] += 1
+        elif cv_set and not cmat_set:
+            self.counts['cmat_missing'] += 1
+        elif not cv_set and not cmat_set:
+            self.counts['both_missing'] += 1
+
+        # Both sets present, compute the score and place in correct category.
+        else:
+            f1_score, true_pos, false_pos, false_neg = self.get_f1_tp_fp_fn(cv_set, cmat_set)
+            if false_pos and not false_neg:
+                k = 'cmat_superset'
+            elif not false_pos and false_neg:
+                k = 'cmat_subset'
+            elif not false_pos and not false_neg:
+                k = 'exact_match'
+            elif true_pos:
+                k = 'divergent_match'
+            else:
+                k = 'mismatch'
+            self.counts[k] += 1
+            self.scores[k] += f1_score
+
+    def finalise(self):
+        """Averages scores over counts, and also calculate both_present count & score."""
+        self.both_present_count = sum(self.counts[k] for k in self.both_present_keys)
+        if self.both_present_count:
+            self.both_present_score = sum(self.scores[k] for k in self.both_present_keys) / self.both_present_count
+        for k in self.all_keys:
+            self.scores[k] = self.scores[k] / self.counts[k] if self.counts[k] else 0
+
+    def report(self):
+        total = sum(v for v in self.counts.values())
+        print(f'Total = {total}')
+        self.pretty_print(
+            ('Category', 'Count', 'Percent', 'F1 Score'),
+            [(k, self.counts[k], f'{self.counts[k] / total:.1%}', f'{self.scores[k]:.2f}') for k in self.both_present_keys]
+            + [('=> both_present', self.both_present_count, f'{self.both_present_count / total:.1%}', f'{self.both_present_score:.2f}')]
+            + [(k, self.counts[k], f'{self.counts[k] / total:.1%}', f'{self.scores[k]:.2f}') for k in self.some_missing_keys]
+        )
+
+    @staticmethod
+    def get_f1_tp_fp_fn(cv_set, cmat_set):
+        """Returns f1-score and number of true positives, false positives and false negatives."""
+        if len(cv_set) == 0 and len(cmat_set) == 0:
+            return 0, 0, 0, 0
+        tp = len(cmat_set & cv_set)
+        fp = len(cmat_set - cv_set)
+        fn = len(cv_set - cmat_set)
+        return 2 * tp / (2 * tp + fp + fn), tp, fp, fn
+
+    @staticmethod
+    def pretty_print(header, table):
+        cell_widths = [len(h) for h in header]
+        for row in table:
+            for i, cell in enumerate(row):
+                cell_widths[i] = max(cell_widths[i], len(str(cell)))
+        format_string = '  '.join('{%s:>%s}' % (i, w) for i, w in enumerate(cell_widths))
+        print(' ' + format_string.format(*header) + ' ')
+        for row in table:
+            print(' ' + format_string.format(*row) + ' ')
diff --git a/setup.py b/setup.py
index cc8e9cb0..1a675e09 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ def get_requires():
 
 
 setup(name='eva_cttv_pipeline',
-      version='2.7.4',
+      version='2.8.0',
       packages=find_packages(),
       install_requires=get_requires(),
       #! TBD: list as a dependency subpackage 'clinvar_xml_utils.clinvar_xml_utils.clinvar_xml_utils'
diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz
index e298e742..f793609a 100644
Binary files a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz and b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_annotation_output.xml.gz differ
diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py b/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py
index a5a80cab..1a7f0498 100644
--- a/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py
+++ b/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar.py
@@ -40,6 +40,9 @@ def test_allele_origins(self):
     def test_valid_allele_origins(self):
         assert self.test_clinvar_record.valid_allele_origins == {'germline', 'inherited'}
 
+    def test_trait_efo_ids(self):
+        assert self.test_clinvar_record.traits[0].current_efo_aligned_xrefs == [('MONDO', 'MONDO:0012990', 'current')]
+
 
 class TestClinvarRecordMeasure:
     @classmethod
@@ -71,3 +74,6 @@ def test_variant_type(self):
 
     def test_measure_set_pubmed_refs(self):
         assert self.test_crm.pubmed_refs == []
+
+    def test_so_terms(self):
+        assert self.test_crm.existing_so_terms == {'SO:0001583'}
diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/test_set_metrics.py b/tests/eva_cttv_pipeline/evidence_string_generation/test_set_metrics.py
new file mode 100644
index 00000000..fab0b88f
--- /dev/null
+++ b/tests/eva_cttv_pipeline/evidence_string_generation/test_set_metrics.py
@@ -0,0 +1,39 @@
+from eva_cttv_pipeline.evidence_string_generation.set_metrics import SetComparisonMetrics
+
+
+# Tolerance for float comparisons
+EPSILON = 1e-6
+
+
+def test_f1_score():
+    assert abs(SetComparisonMetrics.get_f1_tp_fp_fn({1, 2}, {1})[0] - 2 / 3) < EPSILON
+    assert abs(SetComparisonMetrics.get_f1_tp_fp_fn({1, 2}, {2, 1})[0] - 1) < EPSILON
+    assert SetComparisonMetrics.get_f1_tp_fp_fn({}, {})[0] == 0
+
+
+def test_count_and_score():
+    metrics = SetComparisonMetrics()
+    metrics.count_and_score({}, {})
+    metrics.count_and_score({1, 2}, {1})
+    metrics.count_and_score({1, 2}, {2, 1})
+    metrics.count_and_score({1, 2}, {2, 3})
+    metrics.count_and_score({1, 2}, {2, 3, 4})
+    metrics.count_and_score({1}, {2})
+    metrics.count_and_score([], {2, 3})
+    metrics.finalise()
+
+    assert metrics.counts == {
+        'exact_match': 1,
+        'cmat_superset': 0,
+        'cmat_subset': 1,
+        'divergent_match': 2,
+        'mismatch': 1,
+        'cv_missing': 1,
+        'cmat_missing': 0,
+        'both_missing': 1,
+    }
+
+    assert metrics.scores['exact_match'] == 1
+    assert metrics.scores['mismatch'] == 0
+    # mean(2/(2+1+1), 2/(2+2+1)) = mean(0.5, 0.4) = 0.45
+    assert abs(metrics.scores['divergent_match'] - 0.45) < EPSILON