From c12c8eecddcd67a2dc919419f2e96619545b6ae7 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 27 Jan 2023 15:47:03 -0800 Subject: [PATCH] refine: Re-implement --year-bounds with error handling Previously, the value specified was unused in the code. This restores the functionality. I moved the min_max_year argument from the constructor to range() since it is only used there. Fixes https://github.com/nextstrain/augur/issues/1136. --- augur/dates/__init__.py | 2 +- augur/dates/ambiguous_date.py | 65 ++++++++++++++++-- augur/dates/errors.py | 3 + augur/refine.py | 8 ++- tests/dates/test_ambiguous_date.py | 35 ++++++++++ .../refine/cram/year-bounds-error.t | 50 ++++++++++++++ tests/functional/refine/cram/year-bounds.t | 66 +++++++++++++++++++ 7 files changed, 222 insertions(+), 7 deletions(-) create mode 100644 tests/functional/refine/cram/year-bounds-error.t create mode 100644 tests/functional/refine/cram/year-bounds.t diff --git a/augur/dates/__init__.py b/augur/dates/__init__.py index 472d208be..aaf448036 100644 --- a/augur/dates/__init__.py +++ b/augur/dates/__init__.py @@ -117,7 +117,7 @@ def get_numerical_date_from_value(value, fmt=None, min_max_year=None): value = fmt.replace('%Y', value).replace('%m', 'XX').replace('%d', 'XX') if 'XX' in value: try: - ambig_date = AmbiguousDate(value, fmt=fmt, min_max_year=min_max_year).range() + ambig_date = AmbiguousDate(value, fmt=fmt).range(min_max_year=min_max_year) except InvalidDate as error: raise AugurError(str(error)) from error if ambig_date is None or None in ambig_date: diff --git a/augur/dates/ambiguous_date.py b/augur/dates/ambiguous_date.py index 965400627..9d111ef45 100644 --- a/augur/dates/ambiguous_date.py +++ b/augur/dates/ambiguous_date.py @@ -3,7 +3,7 @@ import functools import re -from .errors import InvalidDate +from .errors import InvalidDate, InvalidYearBounds def tuple_to_date(year, month, day): @@ -41,14 +41,13 @@ def resolve_uncertain_int(uncertain_string, min_or_max): class AmbiguousDate: """Transforms a date string with uncertainty into the range of possible dates.""" - def __init__(self, uncertain_date, fmt="%Y-%m-%d", min_max_year=None): + def __init__(self, uncertain_date, fmt="%Y-%m-%d"): self.uncertain_date = uncertain_date self.fmt = fmt - self.min_max_year = min_max_year self.assert_only_less_significant_uncertainty() - def range(self): + def range(self, min_max_year=None): """Return the range of possible dates defined by the ambiguous date. Impose an upper limit of today's date. @@ -65,6 +64,24 @@ def range(self): resolve_uncertain_int(self.uncertain_date_components["d"], "max"), ) + # Limit dates with ambiguous years to the given bounds. + if "X" in self.uncertain_date_components["Y"] and min_max_year: + lower_bound, upper_bound = get_bounds(min_max_year) + if lower_bound: + # lower_bound should always be truth-y, but add indentation here for readability. + if max_date < lower_bound: + raise InvalidDate(self.uncertain_date, f"Not possible for date to fall within bounds [{lower_bound}, {upper_bound}]") + + if min_date < lower_bound: + min_date = lower_bound + + if upper_bound: + if upper_bound < min_date: + raise InvalidDate(self.uncertain_date, f"Not possible for date to fall within bounds [{lower_bound}, {upper_bound}]") + + if max_date > upper_bound: + max_date = upper_bound + # Limit the min and max dates to be no later than today's date. min_date = min(min_date, datetime.date.today()) max_date = min(max_date, datetime.date.today()) @@ -131,3 +148,43 @@ def assert_only_less_significant_uncertainty(self): raise InvalidDate(self.uncertain_date, "Month contains uncertainty, so day must also be uncertain." ) + + +def get_bounds(min_max_year): + """Get exact date bounds based on given years.""" + # This must be an iterable with at least one value. + assert len(min_max_year) > 0 + + if len(min_max_year) > 2: + raise InvalidYearBounds(f"The year bounds {min_max_year!r} must have only one (lower) or two (lower, upper) bounds.") + + lower_year = int(min_max_year[0]) + + if len(min_max_year) == 2: + upper_year = int(min_max_year[1]) + else: + upper_year = None + + # Ensure years are properly ordered. + if lower_year and upper_year and lower_year > upper_year: + lower_year, upper_year = upper_year, lower_year + + try: + lower_bound = datetime.date(lower_year, 1, 1) + except ValueError as error: + if str(error).startswith("year"): + raise InvalidYearBounds(f"{lower_year} is not a valid year.") from error + else: + raise + if upper_year: + try: + upper_bound = datetime.date(upper_year, 12, 31) + except ValueError as error: + if str(error).startswith("year"): + raise InvalidYearBounds(f"{upper_year} is not a valid year.") from error + else: + raise + else: + upper_bound = None + + return (lower_bound, upper_bound) diff --git a/augur/dates/errors.py b/augur/dates/errors.py index 4fe3e981a..4a500ffd3 100644 --- a/augur/dates/errors.py +++ b/augur/dates/errors.py @@ -9,3 +9,6 @@ def __init__(self, date, message): def __str__(self): """Return a human-readable summary of the error.""" return f"Invalid date {self.date!r}: {self.message}" + +class InvalidYearBounds(Exception): + """Custom exception class to handle year bounds in unsupported formats.""" diff --git a/augur/refine.py b/augur/refine.py index a24a441c8..382cfa557 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -5,6 +5,7 @@ import sys from Bio import Phylo from .dates import get_numerical_dates +from .dates.errors import InvalidYearBounds from .io.metadata import read_metadata from .utils import read_tree, write_json, InvalidTreeError from .errors import AugurError @@ -206,8 +207,11 @@ def run(args): metadata = read_metadata(args.metadata) if args.year_bounds: args.year_bounds.sort() - dates = get_numerical_dates(metadata, fmt=args.date_format, - min_max_year=args.year_bounds) + try: + dates = get_numerical_dates(metadata, fmt=args.date_format, + min_max_year=args.year_bounds) + except InvalidYearBounds as error: + raise AugurError(f"Invalid value for --year-bounds: {error}") # save input state string for later export for n in T.get_terminals(): diff --git a/tests/dates/test_ambiguous_date.py b/tests/dates/test_ambiguous_date.py index ce8f90370..11ca9aaee 100644 --- a/tests/dates/test_ambiguous_date.py +++ b/tests/dates/test_ambiguous_date.py @@ -81,3 +81,38 @@ def test_resolve_uncertain_int(self, date_str, min_or_max, expected): def test_assert_only_less_significant_uncertainty(self, date_str, expected_error): with pytest.raises(InvalidDate, match=expected_error): AmbiguousDate(date_str) + + @freeze_time("2020-05-05") + @pytest.mark.parametrize( + "date_str, min_max_year, expected_range", + [ + ("20XX-XX-XX", [2000, 2010], (datetime.date(2000, 1, 1), datetime.date(2010, 12, 31))), + + # This option does not apply when the year is exact. However, a separate limit of the current date is applied. + ("2020-XX-XX", [2000, 2010], (datetime.date(2020, 1, 1), datetime.date(2020, 5, 5))), + ("2020-12-XX", [2000, 2010], (datetime.date(2020, 5, 5), datetime.date(2020, 5, 5))), + ("2020-12-01", [2000, 2010], (datetime.date(2020, 5, 5), datetime.date(2020, 5, 5))), + + # The upper bound is in the future, which is valid. However, a separate limit of the current date is applied. + ("20XX-XX-XX", [2010, 2030], (datetime.date(2010, 1, 1), datetime.date(2020, 5, 5))), + + # When there is no upper bound, a date can appear in the future. However, a separate limit of the current date is applied. + ("20XX-XX-XX", [2010], (datetime.date(2010, 1, 1), datetime.date(2020, 5, 5))), + ], + ) + def test_min_max_year(self, date_str, min_max_year, expected_range): + assert ( + AmbiguousDate(date_str).range(min_max_year=min_max_year) == expected_range + ) + + @freeze_time("2020-05-05") + @pytest.mark.parametrize( + "date_str, min_max_year", + [ + ("20XX-XX-XX", [1950, 1960]), + ("19XX-XX-XX", [2000, 2010]), + ], + ) + def test_min_max_year_date_error(self, date_str, min_max_year): + with pytest.raises(InvalidDate, match="Not possible for date to fall within bounds"): + AmbiguousDate(date_str).range(min_max_year=min_max_year) diff --git a/tests/functional/refine/cram/year-bounds-error.t b/tests/functional/refine/cram/year-bounds-error.t new file mode 100644 index 000000000..935fde5a3 --- /dev/null +++ b/tests/functional/refine/cram/year-bounds-error.t @@ -0,0 +1,50 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Create metadata with a strain that has partial ambiguity on the century-level +(20XX) so that --year-bounds is applied. + + $ cat >metadata.tsv <<~~ + > strain date + > PAN/CDC_259359_V1_V3/2015 20XX-XX-XX + > ~~ + + +Check that invalid --year-bounds provides useful error messages. + + $ ${AUGUR} refine \ + > --tree "$TESTDIR/../data/tree_raw.nwk" \ + > --alignment "$TESTDIR/../data/aligned.fasta" \ + > --metadata metadata.tsv \ + > --output-tree tree.nwk \ + > --output-node-data branch_lengths.json \ + > --timetree \ + > --year-bounds 1950 1960 1970 \ + > --divergence-units mutations > /dev/null + ERROR: Invalid value for --year-bounds: The year bounds [1950, 1960, 1970] must have only one (lower) or two (lower, upper) bounds. + [2] + + $ ${AUGUR} refine \ + > --tree "$TESTDIR/../data/tree_raw.nwk" \ + > --alignment "$TESTDIR/../data/aligned.fasta" \ + > --metadata metadata.tsv \ + > --output-tree tree.nwk \ + > --output-node-data branch_lengths.json \ + > --timetree \ + > --year-bounds 0 1000 \ + > --divergence-units mutations > /dev/null + ERROR: Invalid value for --year-bounds: 0 is not a valid year. + [2] + + $ ${AUGUR} refine \ + > --tree "$TESTDIR/../data/tree_raw.nwk" \ + > --alignment "$TESTDIR/../data/aligned.fasta" \ + > --metadata metadata.tsv \ + > --output-tree tree.nwk \ + > --output-node-data branch_lengths.json \ + > --timetree \ + > --year-bounds -2000 -3000 \ + > --divergence-units mutations > /dev/null + ERROR: Invalid value for --year-bounds: -3000 is not a valid year. + [2] diff --git a/tests/functional/refine/cram/year-bounds.t b/tests/functional/refine/cram/year-bounds.t new file mode 100644 index 000000000..31197c33b --- /dev/null +++ b/tests/functional/refine/cram/year-bounds.t @@ -0,0 +1,66 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Create a copy of tests/functional/refine/data/metadata.tsv, adding partial ambiguity on the century-level (20XX) for the first strain PAN/CDC_259359_V1_V3/2015. + + $ cat >metadata.tsv <<~~ + > strain date + > PAN/CDC_259359_V1_V3/2015 20XX-XX-XX + > COL/FLR_00024/2015 2015-12-XX + > PRVABC59 2015-12-XX + > COL/FLR_00008/2015 2015-12-XX + > Colombia/2016/ZC204Se 2016-01-06 + > ZKC2/2016 2016-02-16 + > VEN/UF_1/2016 2016-03-25 + > DOM/2016/BB_0059 2016-04-04 + > BRA/2016/FC_6706 2016-04-08 + > DOM/2016/BB_0183 2016-04-18 + > EcEs062_16 2016-04-XX + > HND/2016/HU_ME59 2016-05-13 + > ~~ + +Limit ambiguous dates to be within (2000, 2020). + + $ ${AUGUR} refine \ + > --tree "$TESTDIR/../data/tree_raw.nwk" \ + > --alignment "$TESTDIR/../data/aligned.fasta" \ + > --metadata metadata.tsv \ + > --output-tree tree.nwk \ + > --output-node-data branch_lengths.json \ + > --timetree \ + > --year-bounds 2000 2020 \ + > --coalescent opt \ + > --date-confidence \ + > --date-inference marginal \ + > --clock-filter-iqd 4 \ + > --seed 314159 \ + > --divergence-units mutations &> /dev/null + +Check that the inferred date is 2020-12-31. +TODO: Using jq woud be cleaner, but requires an extra dev dependency. + + $ python3 -c 'import json, sys; print(json.load(sys.stdin)["nodes"]["PAN/CDC_259359_V1_V3/2015"]["date"])' < branch_lengths.json + 2020-12-31 + +Reverse the order to check that order does not matter. + + $ ${AUGUR} refine \ + > --tree "$TESTDIR/../data/tree_raw.nwk" \ + > --alignment "$TESTDIR/../data/aligned.fasta" \ + > --metadata metadata.tsv \ + > --output-tree tree.nwk \ + > --output-node-data branch_lengths.json \ + > --timetree \ + > --year-bounds 2020 2000 \ + > --coalescent opt \ + > --date-confidence \ + > --date-inference marginal \ + > --clock-filter-iqd 4 \ + > --seed 314159 \ + > --divergence-units mutations &> /dev/null + +Run the same check as above. + + $ python3 -c 'import json, sys; print(json.load(sys.stdin)["nodes"]["PAN/CDC_259359_V1_V3/2015"]["date"])' < branch_lengths.json + 2020-12-31