diff --git a/bin/trait_mapping/create_table_for_manual_curation.py b/bin/trait_mapping/create_table_for_manual_curation.py index 877c26f1..eccab88e 100755 --- a/bin/trait_mapping/create_table_for_manual_curation.py +++ b/bin/trait_mapping/create_table_for_manual_curation.py @@ -5,22 +5,28 @@ import pandas as pd from cmat.trait_mapping.ols import ( - get_ontology_label_from_ols, is_current_and_in_efo, is_in_efo, + get_ontology_label_from_ols, is_current_and_in_efo, is_in_efo, get_replacement_term, ) -def find_previous_mapping(trait_name, previous_mappings): +def find_previous_mapping_and_replacement(trait_name, previous_mappings): if trait_name not in previous_mappings: - return '' + return '', '' uri = previous_mappings[trait_name] - label = get_ontology_label_from_ols(uri) - uri_is_current_and_in_efo = is_current_and_in_efo(uri) - uri_in_efo = is_in_efo(uri) - if uri_in_efo: - trait_status = 'EFO_CURRENT' if uri_is_current_and_in_efo else 'EFO_OBSOLETE' - else: - trait_status = 'NOT_CONTAINED' + label = get_ontology_label(uri) + trait_status = get_trait_status(uri) trait_string = '|'.join([uri, label, 'NOT_SPECIFIED', 'previously-used', trait_status]) + replacement_string = find_replacement_mapping(uri) + return trait_string, replacement_string + + +def find_replacement_mapping(previous_uri): + replacement_uri = get_replacement_term(previous_uri) + if not replacement_uri: + return '' + label = get_ontology_label(replacement_uri) + trait_status = get_trait_status(replacement_uri) + trait_string = '|'.join([replacement_uri, label, 'NOT_SPECIFIED', 'replacement', trait_status]) return trait_string @@ -31,6 +37,21 @@ def find_exact_mapping(trait_name, mappings): return '' +def get_ontology_label(uri): + label = get_ontology_label_from_ols(uri) + return label if label is not None else '' + + +def get_trait_status(uri): + uri_is_current_and_in_efo = is_current_and_in_efo(uri) + uri_in_efo = is_in_efo(uri) + if uri_in_efo: + trait_status = 'EFO_CURRENT' if 
uri_is_current_and_in_efo else 'EFO_OBSOLETE' + else: + trait_status = 'NOT_CONTAINED' + return trait_status + + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( @@ -66,9 +87,9 @@ def find_exact_mapping(trait_name, mappings): notes = f'"{notes}\n{previous_comments[trait_name]}"' # Use maximum of 50 mappings to improve Google Sheets performance mappings = fields[3:53] - previous_mapping = find_previous_mapping(trait_name, previous_mappings) + previous_mapping, replacement_mapping = find_previous_mapping_and_replacement(trait_name, previous_mappings) exact_mapping = find_exact_mapping(trait_name, mappings) - rows.append([trait_name, trait_freq, notes, previous_mapping, exact_mapping] + mappings) + rows.append([trait_name, trait_freq, notes, previous_mapping, exact_mapping, replacement_mapping] + mappings) rows.sort(key=lambda x: (x[2], int(x[1])), reverse=True) with open(args.output, 'w') as outfile: diff --git a/cmat/trait_mapping/ols.py b/cmat/trait_mapping/ols.py index 76d1d628..a07ba0de 100644 --- a/cmat/trait_mapping/ols.py +++ b/cmat/trait_mapping/ols.py @@ -3,8 +3,9 @@ import requests import urllib -from cmat.trait_mapping.utils import json_request +from retry import retry +from cmat.trait_mapping.utils import json_request, ServerError OLS_EFO_SERVER = 'https://www.ebi.ac.uk/ols' # The setting for local OLS installation should be uncommented if necessary. Note that the link @@ -59,6 +60,7 @@ def double_encode_uri(uri: str) -> str: return urllib.parse.quote(urllib.parse.quote(uri, safe=""), safe="") +@retry(exceptions=(ConnectionError, ServerError), logger=logger, tries=8, delay=2, backoff=1.2, jitter=(1, 3)) def ols_efo_query(uri: str) -> requests.Response: """ Query EFO using OLS for a given ontology uri, returning the response from the request. 
@@ -67,8 +69,11 @@ def ols_efo_query(uri: str) -> requests.Response: :return: Response from OLS """ double_encoded_uri = double_encode_uri(uri) - return requests.get( + response = requests.get( "{}/api/ontologies/efo/terms/{}".format(OLS_EFO_SERVER, double_encoded_uri)) + if 500 <= response.status_code < 600: + raise ServerError + return response @lru_cache(maxsize=16384) @@ -92,7 +97,24 @@ def is_in_efo(uri: str) -> bool: Checks whether given ontology uri is a valid term in EFO. :param uri: Ontology uri to use in querying EFO using OLS - :return: Boolean value, true if ontology uri is valid and non-obsolete term in EFO + :return: Boolean value, true if ontology uri is valid term in EFO """ response = ols_efo_query(uri) return response.status_code == 200 + + +@lru_cache(maxsize=16384) +def get_replacement_term(uri: str) -> str: + """ + Finds replacement term in EFO (if present) for the given ontology uri. + + :param uri: Ontology uri to use in querying EFO using OLS + :return: Replacement term URI or empty string if not obsolete + """ + response = ols_efo_query(uri) + if response.status_code != 200: + return "" + response_json = response.json() + if response_json["term_replaced_by"] is not None: + return response_json["term_replaced_by"] + return "" diff --git a/cmat/trait_mapping/utils.py b/cmat/trait_mapping/utils.py index fe536ab5..1c0bfe13 100644 --- a/cmat/trait_mapping/utils.py +++ b/cmat/trait_mapping/utils.py @@ -1,9 +1,15 @@ import logging import requests +from requests import HTTPError from retry import retry + logger = logging.getLogger(__package__) +class ServerError(HTTPError): + """A server-side error occurred.""" + + @retry(exceptions=(ConnectionError, requests.RequestException), logger=logger, tries=8, delay=2, backoff=1.2, jitter=(1, 3)) def json_request(url: str, payload: dict = None, method=requests.get) -> dict: diff --git a/docs/manual-curation/step2-manual-curation.md b/docs/manual-curation/step2-manual-curation.md index 94f7d17b..a2ef73d3 
100644 --- a/docs/manual-curation/step2-manual-curation.md +++ b/docs/manual-curation/step2-manual-curation.md @@ -3,7 +3,7 @@ The goals of the manual curation: * All traits which are linked to NT expansion (nucleotide repeat expansion) variants must be curated. Those are marked as "NT expansion" in the “Notes” column. * All traits with occurrence ≥ **10** must be curated. Additionally, if there are fewer than **200** such terms, then the top 200 terms must be curated. -* _Suggested previous mapping_ traits should be checked for any terms that have become obsolete since the last iteration. This can be done by filtering then searching for the string EFO\_OBSOLETE +* _Suggested previous mapping_ traits should be checked for any terms that have become obsolete since the last iteration. These will be colored red and likely have a _suggested replacement mapping_ provided in the appropriate column. If no replacement is provided, curate as usual. * For the rest of the traits, we curate as many as possible. Good mappings must be eyeballed to ensure they are actually good. Alternative mappings for medium or low quality mappings can be searched for using OLS. If a mapping can't be found in EFO, look for a mapping to an HP, ORDO, or MONDO trait name. Most HP/ORDO/MONDO terms will also be in EFO but some are not. These can be imported to EFO using the Webulous submission service. @@ -27,8 +27,8 @@ Curation should be done by subsequently applying filters to appropriate columns, * 1\. **There is a previously assigned mapping for this trait.** All of these are the decisions that we made in the past, so we trust them (to an extent). Copy and paste previously used mappings into “Mapping to use”. Then review them according to the following steps. * 1.1. **The previously assigned mapping is in EFO** - * 1.1.1. **The previously assigned mapping is in EFO and is exact.** Mark as finished immediately. (It's extremely unlikely that a better mapping could be found). - * 1.1.2.
**The previously assigned mapping is in EFO and IS NOT exact.** Review the mappings to see if a better (more accurate/specific) mapping is available. Then mark as finished. + * 1.1.1. **The previously assigned mapping is in EFO and is exact and current.** Mark as finished immediately. (It's extremely unlikely that a better mapping could be found). + * 1.1.2. **The previously assigned mapping is in EFO and IS NOT exact or IS NOT current.** Review the mappings to see if a better (more accurate/specific, non-obsolete) mapping is available. Then mark as finished. * 1.2. **The previously assigned mapping is not contained in EFO.** We need to either find a mapping which is already in EFO, or import these terms into EFO. * 1.2.1. **The previously used mapping IS NOT contained in EFO and is exact.** These are good candidates to mark as finished and then import into EFO afterwards. However, quickly check whether there are non-exact matches which are already in EFO and are as good as exact mappings. * E. g. if the exact mapping is “erythrocytosis 6, familial” and not in EFO, but there is an inexact mapping “familial erythrocytosis 6” which *is* in EFO, we should use the inexact mapping.