Skip to content

Commit

Permalink
refine: Re-implement --year-bounds with error handling
Browse files Browse the repository at this point in the history
Previously, the value specified was unused in the code. This restores
the functionality.

I moved the min_max_year argument from the constructor to range() since
it is only used there.

Fixes #1136.
  • Loading branch information
victorlin committed Feb 27, 2023
1 parent 15d065a commit a174f6b
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 4 deletions.
2 changes: 1 addition & 1 deletion augur/dates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
value = fmt.replace('%Y', value).replace('%m', 'XX').replace('%d', 'XX')
if 'XX' in value:
try:
ambig_date = AmbiguousDate(value, fmt=fmt, min_max_year=min_max_year).range()
ambig_date = AmbiguousDate(value, fmt=fmt).range(min_max_year=min_max_year)
except InvalidDate as error:
raise AugurError(str(error)) from error
if ambig_date is None or None in ambig_date:
Expand Down
53 changes: 50 additions & 3 deletions augur/dates/ambiguous_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import functools
import re

from augur.errors import AugurError
from .errors import InvalidDate


Expand Down Expand Up @@ -41,14 +42,13 @@ def resolve_uncertain_int(uncertain_string, min_or_max):
class AmbiguousDate:
"""Transforms a date string with uncertainty into the range of possible dates."""

def __init__(self, uncertain_date, fmt="%Y-%m-%d", min_max_year=None):
def __init__(self, uncertain_date, fmt="%Y-%m-%d"):
self.uncertain_date = uncertain_date
self.fmt = fmt
self.min_max_year = min_max_year

self.assert_only_less_significant_uncertainty()

def range(self):
def range(self, min_max_year=None):
"""Return the range of possible dates defined by the ambiguous date.
Impose an upper limit of today's date.
Expand All @@ -65,6 +65,24 @@ def range(self):
resolve_uncertain_int(self.uncertain_date_components["d"], "max"),
)

# Limit dates with ambiguous years to the given bounds.
if "X" in self.uncertain_date_components["Y"] and min_max_year:
lower_bound, upper_bound = get_bounds(min_max_year)
if lower_bound:
# lower_bound should always be truth-y, but add indentation here for readability.
if max_date < lower_bound:
raise InvalidDate(self.uncertain_date, f"Not possible for date to fall within bounds [{lower_bound}, {upper_bound}]")

if min_date < lower_bound:
min_date = lower_bound

if upper_bound:
if upper_bound < min_date:
raise InvalidDate(self.uncertain_date, f"Not possible for date to fall within bounds [{lower_bound}, {upper_bound}]")

if max_date > upper_bound:
max_date = upper_bound

# Limit the min and max dates to be no later than today's date.
min_date = min(min_date, datetime.date.today())
max_date = min(max_date, datetime.date.today())
Expand Down Expand Up @@ -131,3 +149,32 @@ def assert_only_less_significant_uncertainty(self):
raise InvalidDate(self.uncertain_date,
"Month contains uncertainty, so day must also be uncertain."
)


def get_bounds(min_max_year):
    """Get exact date bounds based on given years.

    Parameters
    ----------
    min_max_year
        Sequence holding one (lower) or two (lower, upper) values, each
        convertible to a year with ``int()``. Two years may be given in
        either order.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``. ``lower_bound`` is January 1 of the
        earlier year. ``upper_bound`` is December 31 of the later year, or
        ``None`` when only one year is given.

    Raises
    ------
    AugurError
        If more than two values are given, or if a value is not a valid year.
    """
    # Internal invariant: this must be a sized iterable with at least one value.
    assert len(min_max_year) > 0

    if len(min_max_year) > 2:
        raise AugurError(f"The year bounds {min_max_year!r} must have only one (lower) or two (lower, upper) bounds.")

    years = []
    for value in min_max_year:
        try:
            year = int(value)
            # Constructing a date verifies the year is within the range
            # supported by datetime.date.
            datetime.date(year, 1, 1)
        except (TypeError, ValueError, OverflowError) as error:
            raise AugurError(f"{value!r} is not a valid year.") from error
        years.append(year)

    if len(years) == 1:
        return (datetime.date(years[0], 1, 1), None)

    # Order the *years* before building dates. Swapping already-built dates
    # would pair Dec 31 with the lower year and Jan 1 with the upper year.
    lower_year, upper_year = sorted(years)

    return (
        datetime.date(lower_year, 1, 1),
        datetime.date(upper_year, 12, 31),
    )
48 changes: 48 additions & 0 deletions tests/dates/test_ambiguous_date.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import datetime
import re

from augur.dates import ambiguous_date
from augur.dates.ambiguous_date import AmbiguousDate
from augur.dates.errors import InvalidDate
from augur.errors import AugurError

from freezegun import freeze_time
import pytest
Expand Down Expand Up @@ -81,3 +83,49 @@ def test_resolve_uncertain_int(self, date_str, min_or_max, expected):
def test_assert_only_less_significant_uncertainty(self, date_str, expected_error):
with pytest.raises(InvalidDate, match=expected_error):
AmbiguousDate(date_str)

# Check the date range returned by AmbiguousDate.range() when year bounds are
# passed via min_max_year. Today's date is frozen at 2020-05-05 so the
# "no later than today" limit is deterministic.
@freeze_time("2020-05-05")
@pytest.mark.parametrize(
"date_str, min_max_year, expected_range",
[
# An ambiguous year is limited to the given lower and upper bounds.
("20XX-XX-XX", [2000, 2010], (datetime.date(2000, 1, 1), datetime.date(2010, 12, 31))),
# This option does not apply when the year is exact. However, a separate limit of the current date is applied.
("2020-XX-XX", [2000, 2010], (datetime.date(2020, 1, 1), datetime.date(2020, 5, 5))),
("2020-12-XX", [2000, 2010], (datetime.date(2020, 5, 5), datetime.date(2020, 5, 5))),
("2020-12-01", [2000, 2010], (datetime.date(2020, 5, 5), datetime.date(2020, 5, 5))),
# The upper bound is in the future, which is valid. However, a separate limit of the current date is applied.
("20XX-XX-XX", [2010, 2030], (datetime.date(2010, 1, 1), datetime.date(2020, 5, 5))),
# When there is no upper bound, a date can appear in the future. However, a separate limit of the current date is applied.
("20XX-XX-XX", [2010], (datetime.date(2010, 1, 1), datetime.date(2020, 5, 5))),
],
)
def test_min_max_year(self, date_str, min_max_year, expected_range):
# The returned (min, max) tuple must equal the expected range exactly.
assert (
AmbiguousDate(date_str).range(min_max_year=min_max_year) == expected_range
)

# Check that an ambiguous year which cannot fall within the given year bounds
# raises InvalidDate.
@freeze_time("2020-05-05")
@pytest.mark.parametrize(
"date_str, min_max_year",
[
# All possible dates are later than the upper bound.
("20XX-XX-XX", [1950, 1960]),
# All possible dates are earlier than the lower bound.
("19XX-XX-XX", [2000, 2010]),
],
)
def test_min_max_year_date_error(self, date_str, min_max_year):
with pytest.raises(InvalidDate, match="Not possible for date to fall within bounds"):
AmbiguousDate(date_str).range(min_max_year=min_max_year)

# Check that supplying more than two year bounds raises AugurError.
@freeze_time("2020-05-05")
@pytest.mark.parametrize(
"date_str, min_max_year",
[
("20XX-XX-XX", [1950, 1960, 1970]),
],
)
def test_min_max_year_augur_error(self, date_str, min_max_year):
# re.escape is needed because the expected message contains parentheses.
with pytest.raises(AugurError, match=re.escape("must have only one (lower) or two (lower, upper) bounds")):
AmbiguousDate(date_str).range(min_max_year=min_max_year)
43 changes: 43 additions & 0 deletions tests/functional/refine.t
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,47 @@ This approach does not make sense and should not work without an alignment FASTA
*ERROR: alignment is required* (glob)
[1]

Ensure --year-bounds behaves as expected.

Create a copy of tests/functional/refine/metadata.tsv, adding partial ambiguity at the century level (20XX) for the first strain PAN/CDC_259359_V1_V3/2015.

$ cat >"$TMP/metadata.tsv" <<~~
> strain date
> PAN/CDC_259359_V1_V3/2015 20XX-XX-XX
> COL/FLR_00024/2015 2015-12-XX
> PRVABC59 2015-12-XX
> COL/FLR_00008/2015 2015-12-XX
> Colombia/2016/ZC204Se 2016-01-06
> ZKC2/2016 2016-02-16
> VEN/UF_1/2016 2016-03-25
> DOM/2016/BB_0059 2016-04-04
> BRA/2016/FC_6706 2016-04-08
> DOM/2016/BB_0183 2016-04-18
> EcEs062_16 2016-04-XX
> HND/2016/HU_ME59 2016-05-13
> ~~

Limit ambiguous dates to be within (2000, 2020).

$ ${AUGUR} refine \
> --tree "refine/tree_raw.nwk" \
> --alignment "refine/aligned.fasta" \
> --metadata "$TMP/metadata.tsv" \
> --output-tree "$TMP/tree.nwk" \
> --output-node-data "$TMP/branch_lengths.json" \
> --timetree \
> --year-bounds 2000 2020 \
> --coalescent opt \
> --date-confidence \
> --date-inference marginal \
> --clock-filter-iqd 4 \
> --seed 314159 \
> --divergence-units mutations &> /dev/null

Check that the inferred date is 2020-12-31.
TODO: Using jq would be cleaner, but requires an extra dev dependency.

$ python3 -c 'import json, sys; print(json.load(sys.stdin)["nodes"]["PAN/CDC_259359_V1_V3/2015"]["date"])' < "$TMP/branch_lengths.json"
2020-12-31

$ popd > /dev/null

0 comments on commit a174f6b

Please sign in to comment.