From 2ac2223daf88c2148caa1f6672080f45e741bb07 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 12 Nov 2020 14:58:05 -0800 Subject: [PATCH] Rename ambiguity option to "any" and update logic and tests accordingly Replaces the "all" option for `is_date_ambiguous` with "any" and updates the help documentation, ambiguity boolean logic, and unit tests to clarify this hierarchical treatment of ambiguity. --- augur/filter.py | 4 ++-- augur/utils.py | 19 ++++++++++--------- tests/test_utils.py | 11 ++++++++--- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index edc0b3e8f..bb7f27dda 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -105,8 +105,8 @@ def register_arguments(parser): help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND") parser.add_argument('--include-where', nargs='+', help="Include samples with these values. ex: host=rat. Multiple values are processed as OR (having any of those specified will be included), not AND. This rule is applied last and ensures any sequences matching these rules will be included.") - parser.add_argument('--exclude-ambiguous-dates-by', choices=['all', 'day', 'month', 'year'], - help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or all date fields') + parser.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'], + help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").') parser.add_argument('--query', help="Filter samples by attribute. Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.") parser.add_argument('--output', '-o', help="output file", required=True) diff --git a/augur/utils.py b/augur/utils.py index 2ac54319c..5fb1ed923 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -73,16 +73,16 @@ def ambiguous_date_to_date_range(uncertain_date, fmt, min_max_year=None): def read_metadata(fname, query=None): return MetadataFile(fname, query).read() -def is_date_ambiguous(date, ambiguous_by="all"): +def is_date_ambiguous(date, ambiguous_by="any"): """ - Returns whether a given date string in the format of YYYY-MM-DD is ambiguous by a given part of the date (e.g., day, month, year, or all parts). + Returns whether a given date string in the format of YYYY-MM-DD is ambiguous by a given part of the date (e.g., day, month, year, or any parts). Parameters ---------- date : str Date string in the format of YYYY-MM-DD ambiguous_by : str - Field of the date string to test for ambiguity ("day", "month", "year", "all") + Field of the date string to test for ambiguity ("day", "month", "year", "any") """ date_components = date.split('-', 2) @@ -96,12 +96,13 @@ def is_date_ambiguous(date, ambiguous_by="all"): month = "XX" day = "XX" - return ( - (ambiguous_by == 'all' and ('X' in year or 'X' in month or 'X' in day)) or - (ambiguous_by == 'day' and 'X' in day) or - (ambiguous_by == 'month' and 'X' in month) or - (ambiguous_by == 'year' and 'X' in year) - ) + # Determine ambiguity hierarchically such that, for example, an ambiguous + # month implicates an ambiguous day even when day information is available. + return any(( + "X" in year, + "X" in month and ambiguous_by in ("any", "month", "day"), + "X" in day and ambiguous_by in ("any", "day") + )) def get_numerical_dates(meta_dict, name_col = None, date_col='date', fmt=None, min_max_year=None): if fmt: diff --git a/tests/test_utils.py b/tests/test_utils.py index 694a9105c..264bcc643 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -104,19 +104,24 @@ def test_read_mask_file_drm_file(self, tmpdir): def test_is_date_ambiguous(self): """is_date_ambiguous should return true for ambiguous dates and false for valid dates.""" # Test complete date strings with ambiguous values. - assert utils.is_date_ambiguous("2019-0X-0X", "all") + assert utils.is_date_ambiguous("2019-0X-0X", "any") assert utils.is_date_ambiguous("2019-XX-09", "month") assert utils.is_date_ambiguous("2019-03-XX", "day") assert utils.is_date_ambiguous("201X-03-09", "year") + assert utils.is_date_ambiguous("20XX-01-09", "month") + assert utils.is_date_ambiguous("2019-XX-03", "day") + assert utils.is_date_ambiguous("20XX-01-03", "day") # Test incomplete date strings with ambiguous values. - assert utils.is_date_ambiguous("2019", "all") + assert utils.is_date_ambiguous("2019", "any") assert utils.is_date_ambiguous("201X", "year") assert utils.is_date_ambiguous("2019-XX", "month") assert utils.is_date_ambiguous("2019-10", "day") + assert utils.is_date_ambiguous("2019-XX", "any") + assert utils.is_date_ambiguous("2019-XX", "day") # Test complete date strings without ambiguous dates for the requested field. - assert not utils.is_date_ambiguous("2019-09-03", "all") + assert not utils.is_date_ambiguous("2019-09-03", "any") assert not utils.is_date_ambiguous("2019-03-XX", "month") assert not utils.is_date_ambiguous("2019-09-03", "day") assert not utils.is_date_ambiguous("2019-XX-XX", "year")