EVA-2343 — Version 2.0.1: Improvements in evidence string duplication, PubMed references, string-to-ontology mapping #202

Merged 13 commits on Mar 15, 2021
21 changes: 12 additions & 9 deletions bin/traits_to_zooma_format.py
@@ -8,19 +8,22 @@


class OntologyUri:
db_to_uri_dict = {
'orphanet': 'http://www.orpha.net/ORDO/Orphanet_{}',
'omim': 'http://identifiers.org/omim/{}',
'efo': 'http://www.ebi.ac.uk/efo/{}',
'mesh': 'http://identifiers.org/mesh/{}',
'medgen': 'http://identifiers.org/medgen/{}',
'mondo': 'http://purl.obolibrary.org/obo/MONDO_{}',
# ClinVar stores cross-references in very different formats. This provides their conversion to full IRIs, along with
# examples of what this looks like in ClinVar data.
db_to_uri_conversion = {
'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}', # <XRef ID="1756" DB="Orphanet"/>
'omim': lambda x: f'https://www.omim.org/entry/{x}', # <XRef Type="MIM" ID="612773" DB="OMIM"/>
'efo': lambda x: f'http://www.ebi.ac.uk/efo/{x}', # <XRef ID="EFO_0005137" DB="EFO"/>
'mesh': lambda x: f'http://identifiers.org/mesh/{x}', # <XRef ID="D065630" DB="MeSH"/>
'medgen': lambda x: f'http://identifiers.org/medgen/{x}', # <XRef ID="C0235833" DB="MedGen"/>
# <XRef ID="MONDO:0013353" DB="MONDO"/>
'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
}

def __init__(self, id_, db):
self.id_ = id_
self.db = db
self.uri = self.db_to_uri_dict[self.db.lower()].format(self.id_)
self.uri = self.db_to_uri_conversion[self.db.lower()](self.id_)

def __str__(self):
return self.uri
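As a quick illustration of the lambda-based conversion introduced above, the following self-contained sketch reproduces a trimmed-down version of the class (only two databases are included here for brevity; it is not the full class from the diff):

```python
# Minimal sketch of the cross-reference -> IRI conversion shown above.
db_to_uri_conversion = {
    'orphanet': lambda x: f'http://www.orpha.net/ORDO/Orphanet_{x}',
    'mondo': lambda x: 'http://purl.obolibrary.org/obo/{}'.format(x.replace(':', '_')),
}

class OntologyUri:
    def __init__(self, id_, db):
        self.id_ = id_
        self.db = db
        self.uri = db_to_uri_conversion[db.lower()](id_)

    def __str__(self):
        return self.uri

# MONDO identifiers need the colon replaced to form a valid OBO PURL.
print(OntologyUri('MONDO:0013353', 'MONDO'))  # http://purl.obolibrary.org/obo/MONDO_0013353
```

This shows why MONDO needs a lambda rather than a plain format string: the `:` in `MONDO:0013353` must be rewritten to `_` before substitution.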
@@ -49,7 +52,7 @@ def process_clinvar_record(clinvar_record, outfile):
if (trait.name is None) or (trait.name.lower() == 'not provided'):
continue
for db, identifier, status in trait.xrefs:
if status != 'current' or db.lower() not in OntologyUri.db_to_uri_dict:
if status != 'current' or db.lower() not in OntologyUri.db_to_uri_conversion:
continue
ontology_uri = OntologyUri(identifier, db)
write_zooma_record(clinvar_record.accession, variant_id, trait.name, ontology_uri,
8 changes: 8 additions & 0 deletions docs/generate-evidence-strings.md
@@ -79,6 +79,10 @@ ${BSUB_CMDLINE} -K -M 10G \
--ot-schema ${BATCH_ROOT}/evidence_strings/opentargets-${OT_SCHEMA_VERSION}.json \
--out ${BATCH_ROOT}/evidence_strings/

# Check that the generated evidence strings do not contain any duplicates
sort ${BATCH_ROOT}/evidence_strings/evidence_strings.json | uniq -c | awk '$1 > 1' > \
${BATCH_ROOT}/evidence_strings/duplicates.json

# Convert MedGen and OMIM cross-references into ZOOMA format.
${BSUB_CMDLINE} -K \
-o ${BATCH_ROOT}/logs/traits_to_zooma_format.out \
@@ -90,6 +94,9 @@ ${BSUB_CMDLINE} -K \

## 3. Manual follow-up actions

### Check that generated evidence strings do not contain any duplicates
The algorithm used for generating the evidence strings should not allow any duplicate values to be emitted, and the file `${BATCH_ROOT}/evidence_strings/duplicates.json` should be empty. Check that this is the case.
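If a platform-independent version of the duplicate check is preferred, the shell pipeline from the previous step can be mirrored in Python; this is an illustrative sketch, not part of the pipeline itself:

```python
from collections import Counter

def duplicate_lines(path):
    """Return lines occurring more than once, with their counts
    (equivalent to `sort FILE | uniq -c | awk '$1 > 1'`)."""
    with open(path) as f:
        counts = Counter(f)
    return {line: n for line, n in counts.items() if n > 1}
```

An empty result corresponds to an empty `duplicates.json` file, i.e. the expected outcome.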

### Update summary metrics
After the evidence strings have been generated, summary metrics need to be updated in the Google Sheets [table](https://docs.google.com/spreadsheets/d/1g_4tHNWP4VIikH7Jb0ui5aNr0PiFgvscZYOe69g191k/) on the “Raw statistics” sheet.

@@ -149,6 +156,7 @@ If everything has been done correctly, hash sums will be the same. Note that the
+ Evidence strings
- Version of JSON schema is the same as specified in the Open Targets e-mail
- All traits mentioned in the [spreadsheet](https://docs.google.com/spreadsheets/d/1m4ld3y3Pfust5JSOJOX9ZmImRCKRGi-fGYj_dExoGj8/edit) are mapped to the correct ontology terms in `${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv`.
- The file `${BATCH_ROOT}/evidence_strings/duplicates.json` is empty, meaning there are no duplicates in the generated evidence strings.
* Step 5 “Manual follow-up actions”
+ The summary metrics
- Are present in the [spreadsheet](https://docs.google.com/spreadsheets/d/1g_4tHNWP4VIikH7Jb0ui5aNr0PiFgvscZYOe69g191k/)
7 changes: 6 additions & 1 deletion docs/manual-curation/step2-manual-curation.md
@@ -1,7 +1,7 @@
# Manual curation, part II, biological: perform manual curation

The goals of the manual curation:
* All traits which are linked to NT expansion (nucleotide repeat expansion) variants must be curated. Those are marked as "NT expansion" in the frequency column.
* All traits which are linked to NT expansion (nucleotide repeat expansion) variants must be curated. Those are marked as "NT expansion" in the “Notes” column.
* All traits with occurrence ≥ **10** must be curated. Additionally, if there are fewer than **200** such terms, the top 200 terms must be curated.
* For the rest of the traits, we curate as many as possible.

@@ -61,5 +61,10 @@ The “Status” column has the following acceptable values:

The “Comment” field can contain arbitrary additional information.

### Note on multiple mappings
Sometimes it is necessary to map a single source string to two or more ontology terms to fully represent the concept. For example, “Coronary artery disease/myocardial infarction” should be mapped both to http://www.ebi.ac.uk/efo/EFO_0001645 “Coronary artery disease” and to http://www.ebi.ac.uk/efo/EFO_0000612 “Myocardial infarction”.

To do this, **duplicate** the row containing the disease string, assign different mappings in each of the rows, and mark them both with an appropriate status. This will be handled downstream during export and evidence string generation.
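Downstream, the duplicated rows simply become multiple (ontology ID, label) pairs attached to the same source string. A minimal sketch of that grouping (illustrative data; the variable names mirror `load_efo_mapping` in the evidence string generation code):

```python
from collections import defaultdict

# Two curation rows with the same trait string but different mappings,
# as produced by duplicating the row in the spreadsheet.
rows = [
    ('Coronary artery disease/myocardial infarction',
     'http://www.ebi.ac.uk/efo/EFO_0001645', 'coronary artery disease'),
    ('Coronary artery disease/myocardial infarction',
     'http://www.ebi.ac.uk/efo/EFO_0000612', 'myocardial infarction'),
]

trait_2_efo = defaultdict(list)
for trait_name, ontology_id, ontology_label in rows:
    trait_2_efo[trait_name.lower()].append((ontology_id, ontology_label))

# Both mappings are now attached to the single source string.
print(len(trait_2_efo['coronary artery disease/myocardial infarction']))  # 2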
Member:
Is this true for all cases that carry context, like “susceptibility to” or “resistance to”, or is it limited to traits that are composed of two disease families?

Member (Author):
Oh, that's a very good point. No, as of now it only applies to multiple disease terms. Semantic context would need to be addressed separately at some point in the future (requiring guidance from OT first on how to represent it).

I've added clarifications to the documentation: 12c1dcb


### Note on spaces and line breaks
Sometimes, especially when copy-pasting information from external sources, a mapping label or URL can contain an extra space (at the beginning or end) or an accidental line break. This causes problems in downstream processing and must be manually removed. To minimise the occurrences of this, the Google Sheets template includes a validation formula for the first two columns (“URI of selected mapping” and “Label of selected mapping”). If it detects an extra space symbol or a line break, the cell will be highlighted in red.
17 changes: 13 additions & 4 deletions docs/manual-curation/step3-export-results.md
@@ -28,17 +28,17 @@ export NEW_MAPPINGS=${CURATION_RELEASE_ROOT}/trait_names_to_ontology_mappings.ts
cat \
${CURATION_RELEASE_ROOT}/automated_trait_mappings.tsv \
${CURATION_RELEASE_ROOT}/finished_mappings_curation.tsv \
> ${NEW_MAPPINGS}
| sort -u > ${NEW_MAPPINGS}

# Add all mappings from the database which are *not* present in the results of the current curation iteration (automated
# + manually curated). This is done in order to never lose mappings, even if they are not present in ClinVar during the
# latest curation iteration.
# The first file operand is the list of mappings in the current database; and the second is the list of trait names
# which are only present in the existing database and not in the new mappings.
export LC_ALL=C
join -j 1 -t$'\t' \
<(sort -k1,1 ${EXISTING_MAPPINGS}) \
<(comm -23 <(cut -f1 ${EXISTING_MAPPINGS} | sort -u) <(cut -f1 ${NEW_MAPPINGS} | sort -u)) \
join -j 1 -t $'\t' \
<(sort -t $'\t' -k 1,1 ${EXISTING_MAPPINGS}) \
<(comm -23 <(cut -d $'\t' -f 1 ${EXISTING_MAPPINGS} | sort -u) <(cut -d $'\t' -f 1 ${NEW_MAPPINGS} | sort -u)) \
>> ${NEW_MAPPINGS}

# Run the helper script to prepare the table for EFO import
@@ -61,6 +61,15 @@ ln -s -f ${NEW_MAPPINGS} ${EXISTING_MAPPINGS}
ln -s -f ${CURATION_RELEASE_ROOT}/eva_clinvar.txt ${BATCH_ROOT_BASE}/manual_curation/eva_clinvar.txt
```
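The intent of the `join`/`comm` pipeline above can also be expressed in Python: keep all new mappings, plus every existing mapping whose trait name does not occur among the new ones. This is an illustrative sketch (the function name and sample data are invented, not part of the repository):

```python
def merge_mappings(new_rows, existing_rows):
    """Combine mappings: all new rows, plus existing rows whose trait name
    (the first tab-separated field) is absent from the new mappings."""
    new_traits = {row.split('\t', 1)[0] for row in new_rows}
    preserved = [row for row in existing_rows
                 if row.split('\t', 1)[0] not in new_traits]
    return sorted(set(new_rows)) + preserved

new = ['ataxia\tEFO:1\tataxia']
existing = ['ataxia\tEFO:old\tataxia', 'deafness\tEFO:2\tdeafness']
print(merge_mappings(new, existing))
# ['ataxia\tEFO:1\tataxia', 'deafness\tEFO:2\tdeafness']
```

As in the shell version, an existing mapping is only carried over when its trait name is entirely missing from the current curation iteration, so mappings are never lost.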

## Check that the resulting file contains no duplicates
The resulting list of text-to-ontology mappings should not contain any complete duplicates. Check that this is the case by using the following command. If everything is correct, it should not output anything:

```bash
sort ${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv | uniq -c | awk '$1 > 1'
```

If there are duplicates, resolve this by editing the `${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv` file directly.

## Copy the table for EFO import
The file `${CURATION_RELEASE_ROOT}/efo_import_table.tsv` will contain a partially ready table for EFO import. Copy its contents into the “Add EFO disease” sheet in the curation spreadsheet.

11 changes: 8 additions & 3 deletions eva_cttv_pipeline/clinvar_xml_utils.py
@@ -147,7 +147,10 @@ def traits(self):
return self.trait_set

@property
def observed_pubmed_refs(self):
def evidence_support_pubmed_refs(self):
"""The references of this type represent evidence support for this specific variant being observed in this
specific disease. These are the references displayed on the ClinVar website in the "Assertion and evidence
details" section at the bottom of the page."""
return [int(elem.text)
for elem in find_elements(self.rcv, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]

@@ -187,7 +190,7 @@ def name(self):

@property
def pubmed_refs(self):
"""Trait-specific PubMed references, contained inside a Trait entity."""
"""Trait-specific PubMed references, contained inside a Trait entity. These are usually reviews or practice
guidelines related to a disease or a group of diseases."""
return [int(elem.text) for elem in find_elements(self.trait_xml, './Citation/ID[@Source="PubMed"]')]

@property
@@ -307,7 +311,8 @@ def is_repeat_expansion_variant(self):

@property
def pubmed_refs(self):
"""Variant-specific PubMed references, contained inside a Measure entity."""
"""Variant-specific PubMed references, contained inside a Measure entity. These are usually large reviews which
focus on genetics of specific types of variants or genomic regions."""
return [int(elem.text) for elem in find_elements(self.measure_xml, './Citation/ID[@Source="PubMed"]')]

@property
@@ -244,7 +244,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea

# Literature. ClinVar records provide three types of references: trait-specific; variant-specific; and
# "observed in" references. Open Targets are interested only in that last category.
'literature': sorted(set([str(r) for r in clinvar_record.observed_pubmed_refs])),
'literature': sorted(set([str(r) for r in clinvar_record.evidence_support_pubmed_refs])),

# RCV identifier.
'studyId': clinvar_record.accession,
@@ -343,21 +343,16 @@ def load_efo_mapping(efo_mapping_file):
trait_2_efo = defaultdict(list)
n_efo_mappings = 0

with open(efo_mapping_file, "rt") as f:
with open(efo_mapping_file, 'rt') as f:
for line in f:
line = line.rstrip()
if line.startswith("#") or not line:
if line.startswith('#') or not line:
continue
line_list = line.split("\t")
clinvar_name = line_list[0].lower()
if len(line_list) > 1:
ontology_id_list = line_list[1].split("|")
ontology_label_list = line_list[2].split("|") if len(line_list) > 2 else [None] * len(ontology_id_list)
for ontology_id, ontology_label in zip(ontology_id_list, ontology_label_list):
trait_2_efo[clinvar_name].append((ontology_id, ontology_label))
n_efo_mappings += 1
else:
raise ValueError('No mapping provided for trait: {}'.format(clinvar_name))
line_list = line.split('\t')
assert len(line_list) == 3, f'Incorrect string to EFO mapping format for line {line}'
clinvar_name, ontology_id, ontology_label = line_list
trait_2_efo[clinvar_name.lower()].append((ontology_id, ontology_label))
n_efo_mappings += 1
logger.info('{} EFO mappings loaded'.format(n_efo_mappings))
return trait_2_efo

@@ -416,6 +411,7 @@ def group_diseases_by_efo_mapping(clinvar_record_traits, string_to_efo_mappings,
trait_name = trait.name.lower()
if trait_name not in string_to_efo_mappings: # Traits without an EFO mapping are skipped
report.counters['n_missed_strings_unmapped_traits'] += 1
report.unmapped_traits[trait_name] += 1
continue
for efo_id, efo_label in string_to_efo_mappings[trait_name]:
efo_to_traits[efo_id].append(trait)
@@ -36,8 +36,7 @@ def process_consequence_type_file_tsv(snp_2_gene_filepath):

def process_consequence_type_file(snp_2_gene_file):
logger.info('Loading mapping rs -> ENSG/SOterms')
consequence_type_dict, one_rs_multiple_genes = \
process_consequence_type_file_tsv(snp_2_gene_file)
consequence_type_dict, one_rs_multiple_genes = process_consequence_type_file_tsv(snp_2_gene_file)
logger.info('{} rs->ENSG/SOterms mappings loaded'.format(len(consequence_type_dict)))
logger.info('{} rsIds with multiple gene associations'.format(len(one_rs_multiple_genes)))
return consequence_type_dict
10 changes: 8 additions & 2 deletions eva_cttv_pipeline/trait_mapping/main.py
@@ -1,4 +1,3 @@
from collections import Counter
import csv
import logging
import multiprocessing
@@ -13,6 +12,11 @@

logger = logging.getLogger(__package__)

# These ambiguous trait names cannot be resolved to a specific disease and must not be output
# TODO: Also use this in the future refactor of the quality control system (see issue #114)
NONSPECIFIC_TRAITS = {'disease', 'not provided', 'not specified', 'reclassified - variant of unknown significance',
'see cases', 'variant of unknown significance'}


def get_uris_for_oxo(zooma_result_list: list) -> set:
"""
@@ -91,6 +95,8 @@ def main(input_filepath, output_mappings_filepath, output_curation_filepath, fil

logger.info('Writing output with the processed traits')
for trait in processed_trait_list:
output_trait(trait, mapping_writer, curation_writer)
# Remove non-specific trait names which should never be output
if trait.name.lower() not in NONSPECIFIC_TRAITS:
output_trait(trait, mapping_writer, curation_writer)

logger.info('Finished processing trait names')
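The effect of the new `NONSPECIFIC_TRAITS` filter can be demonstrated in isolation. In this sketch the set is copied from the diff above, while the sample trait names are illustrative:

```python
NONSPECIFIC_TRAITS = {'disease', 'not provided', 'not specified',
                      'reclassified - variant of unknown significance',
                      'see cases', 'variant of unknown significance'}

trait_names = ['Not Provided', 'Lynch syndrome', 'See cases', 'ataxia']
# Lower-casing before the membership test makes the filter case-insensitive.
output = [name for name in trait_names if name.lower() not in NONSPECIFIC_TRAITS]
print(output)  # ['Lynch syndrome', 'ataxia']
```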
16 changes: 8 additions & 8 deletions eva_cttv_pipeline/trait_mapping/trait_names_parsing.py
@@ -20,28 +20,28 @@ def parse_trait_names(filepath: str) -> list:
:param filepath: Path to a gzipped file containing ClinVar XML dump.
:return: A list of Trait objects."""

# Tracks unique (RCV, trait name) tuples
unique_association_tuples = set()
# Tracks how many times a trait name occurs in ClinVar
trait_name_counter = Counter()

# Tracks all traits which are at least once implicated in "NT expansion", or nucleotide repeat expansion, variants.
# Their curation is of highest importance regardless of how many records they are actually associated with.
nt_expansion_traits = set()

for clinvar_record in clinvar_xml_utils.ClinVarDataset(filepath):
traits = set(trait.name for trait in clinvar_record.traits if trait.name is not None)
unique_association_tuples |= {(clinvar_record.accession, trait) for trait in traits}
trait_names = set(trait.name.lower() for trait in clinvar_record.traits if trait.name is not None)
for trait_name in trait_names:
trait_name_counter[trait_name] += 1
if clinvar_record.measure and clinvar_record.measure.is_repeat_expansion_variant:
nt_expansion_traits |= traits
nt_expansion_traits |= trait_names

# Count trait occurrences
trait_names = [t[1] for t in unique_association_tuples]
traits = []
for trait_name, trait_frequency in Counter(trait_names).items():
for trait_name, trait_frequency in trait_name_counter.items():
if trait_name == '-':
print('Skipped {} missing trait names'.format(trait_frequency))
continue
associated_with_nt_expansion = trait_name in nt_expansion_traits
traits.append(Trait(name=trait_name.lower(), frequency=trait_frequency,
traits.append(Trait(name=trait_name, frequency=trait_frequency,
associated_with_nt_expansion=associated_with_nt_expansion))

return traits
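The key behavioural change here, counting each (lower-cased) trait name at most once per ClinVar record instead of collecting (RCV, name) tuples, can be sketched with illustrative data:

```python
from collections import Counter

# Each inner list holds the trait names attached to one ClinVar record.
records = [['Ataxia', 'ataxia'], ['ataxia'], ['deafness']]

trait_name_counter = Counter()
for record_traits in records:
    # Lower-casing before deduplication means differently-cased duplicates
    # within one record are counted only once.
    for name in set(t.lower() for t in record_traits):
        trait_name_counter[name] += 1

print(trait_name_counter)  # Counter({'ataxia': 2, 'deafness': 1})
```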
2 changes: 1 addition & 1 deletion setup.py
@@ -22,7 +22,7 @@ def get_requires():


setup(name='eva_cttv_pipeline',
version='2.0.0',
version='2.0.1',
packages=find_packages(),
install_requires=get_requires(),
package_data={
2 changes: 1 addition & 1 deletion tests/evidence_string_generation/test_clinvar.py
@@ -30,7 +30,7 @@ def test_trait_pubmed_refs(self):
self.assertEqual(self.test_clinvar_record.traits[0].pubmed_refs, [20301475, 20301590, 30285347])

def test_observed_pubmed_refs(self):
self.assertEqual(self.test_clinvar_record.observed_pubmed_refs, [15258582, 15322982])
self.assertEqual(self.test_clinvar_record.evidence_support_pubmed_refs, [15258582, 15322982])

def test_clinical_significance(self):
self.assertEqual(self.test_clinvar_record.clinical_significance_list, ['likely pathogenic', 'pathogenic'])