Skip to content

Commit

Permalink
refine: Re-implement --year-bounds with error handling
Browse files Browse the repository at this point in the history
Previously, the value specified was unused in the code. This restores
the functionality.

I moved the min_max_year argument from the constructor to range() since
it is only used there.

Fixes #1136.
  • Loading branch information
victorlin committed Mar 29, 2023
1 parent c4533b0 commit c12c8ee
Show file tree
Hide file tree
Showing 7 changed files with 222 additions and 7 deletions.
2 changes: 1 addition & 1 deletion augur/dates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
value = fmt.replace('%Y', value).replace('%m', 'XX').replace('%d', 'XX')
if 'XX' in value:
try:
ambig_date = AmbiguousDate(value, fmt=fmt, min_max_year=min_max_year).range()
ambig_date = AmbiguousDate(value, fmt=fmt).range(min_max_year=min_max_year)
except InvalidDate as error:
raise AugurError(str(error)) from error
if ambig_date is None or None in ambig_date:
Expand Down
65 changes: 61 additions & 4 deletions augur/dates/ambiguous_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import functools
import re

from .errors import InvalidDate
from .errors import InvalidDate, InvalidYearBounds


def tuple_to_date(year, month, day):
Expand Down Expand Up @@ -41,14 +41,13 @@ def resolve_uncertain_int(uncertain_string, min_or_max):
class AmbiguousDate:
"""Transforms a date string with uncertainty into the range of possible dates."""

def __init__(self, uncertain_date, fmt="%Y-%m-%d", min_max_year=None):
def __init__(self, uncertain_date, fmt="%Y-%m-%d"):
self.uncertain_date = uncertain_date
self.fmt = fmt
self.min_max_year = min_max_year

self.assert_only_less_significant_uncertainty()

def range(self):
def range(self, min_max_year=None):
"""Return the range of possible dates defined by the ambiguous date.
Impose an upper limit of today's date.
Expand All @@ -65,6 +64,24 @@ def range(self):
resolve_uncertain_int(self.uncertain_date_components["d"], "max"),
)

# Limit dates with ambiguous years to the given bounds.
if "X" in self.uncertain_date_components["Y"] and min_max_year:
lower_bound, upper_bound = get_bounds(min_max_year)
if lower_bound:
# lower_bound should always be truth-y, but add indentation here for readability.
if max_date < lower_bound:
raise InvalidDate(self.uncertain_date, f"Not possible for date to fall within bounds [{lower_bound}, {upper_bound}]")

if min_date < lower_bound:
min_date = lower_bound

if upper_bound:
if upper_bound < min_date:
raise InvalidDate(self.uncertain_date, f"Not possible for date to fall within bounds [{lower_bound}, {upper_bound}]")

if max_date > upper_bound:
max_date = upper_bound

# Limit the min and max dates to be no later than today's date.
min_date = min(min_date, datetime.date.today())
max_date = min(max_date, datetime.date.today())
Expand Down Expand Up @@ -131,3 +148,43 @@ def assert_only_less_significant_uncertainty(self):
raise InvalidDate(self.uncertain_date,
"Month contains uncertainty, so day must also be uncertain."
)


def get_bounds(min_max_year):
"""Get exact date bounds based on given years."""
# This must be an iterable with at least one value.
assert len(min_max_year) > 0

if len(min_max_year) > 2:
raise InvalidYearBounds(f"The year bounds {min_max_year!r} must have only one (lower) or two (lower, upper) bounds.")

lower_year = int(min_max_year[0])

if len(min_max_year) == 2:
upper_year = int(min_max_year[1])
else:
upper_year = None

# Ensure years are properly ordered.
if lower_year and upper_year and lower_year > upper_year:
lower_year, upper_year = upper_year, lower_year

try:
lower_bound = datetime.date(lower_year, 1, 1)
except ValueError as error:
if str(error).startswith("year"):
raise InvalidYearBounds(f"{lower_year} is not a valid year.") from error
else:
raise
if upper_year:
try:
upper_bound = datetime.date(upper_year, 12, 31)
except ValueError as error:
if str(error).startswith("year"):
raise InvalidYearBounds(f"{upper_year} is not a valid year.") from error
else:
raise
else:
upper_bound = None

return (lower_bound, upper_bound)
3 changes: 3 additions & 0 deletions augur/dates/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ def __init__(self, date, message):
def __str__(self):
"""Return a human-readable summary of the error."""
return f"Invalid date {self.date!r}: {self.message}"

class InvalidYearBounds(Exception):
"""Custom exception class to handle year bounds in unsupported formats."""
8 changes: 6 additions & 2 deletions augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sys
from Bio import Phylo
from .dates import get_numerical_dates
from .dates.errors import InvalidYearBounds
from .io.metadata import read_metadata
from .utils import read_tree, write_json, InvalidTreeError
from .errors import AugurError
Expand Down Expand Up @@ -206,8 +207,11 @@ def run(args):
metadata = read_metadata(args.metadata)
if args.year_bounds:
args.year_bounds.sort()
dates = get_numerical_dates(metadata, fmt=args.date_format,
min_max_year=args.year_bounds)
try:
dates = get_numerical_dates(metadata, fmt=args.date_format,
min_max_year=args.year_bounds)
except InvalidYearBounds as error:
raise AugurError(f"Invalid value for --year-bounds: {error}")

# save input state string for later export
for n in T.get_terminals():
Expand Down
35 changes: 35 additions & 0 deletions tests/dates/test_ambiguous_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,38 @@ def test_resolve_uncertain_int(self, date_str, min_or_max, expected):
def test_assert_only_less_significant_uncertainty(self, date_str, expected_error):
with pytest.raises(InvalidDate, match=expected_error):
AmbiguousDate(date_str)

@freeze_time("2020-05-05")
@pytest.mark.parametrize(
"date_str, min_max_year, expected_range",
[
("20XX-XX-XX", [2000, 2010], (datetime.date(2000, 1, 1), datetime.date(2010, 12, 31))),
# This option does not apply when the year is exact. However, a separate limit of the current date is applied.
("2020-XX-XX", [2000, 2010], (datetime.date(2020, 1, 1), datetime.date(2020, 5, 5))),
("2020-12-XX", [2000, 2010], (datetime.date(2020, 5, 5), datetime.date(2020, 5, 5))),
("2020-12-01", [2000, 2010], (datetime.date(2020, 5, 5), datetime.date(2020, 5, 5))),
# The upper bound is in the future, which is valid. However, a separate limit of the current date is applied.
("20XX-XX-XX", [2010, 2030], (datetime.date(2010, 1, 1), datetime.date(2020, 5, 5))),
# When there is no upper bound, a date can appear in the future. However, a separate limit of the current date is applied.
("20XX-XX-XX", [2010], (datetime.date(2010, 1, 1), datetime.date(2020, 5, 5))),
],
)
def test_min_max_year(self, date_str, min_max_year, expected_range):
assert (
AmbiguousDate(date_str).range(min_max_year=min_max_year) == expected_range
)

@freeze_time("2020-05-05")
@pytest.mark.parametrize(
"date_str, min_max_year",
[
("20XX-XX-XX", [1950, 1960]),
("19XX-XX-XX", [2000, 2010]),
],
)
def test_min_max_year_date_error(self, date_str, min_max_year):
with pytest.raises(InvalidDate, match="Not possible for date to fall within bounds"):
AmbiguousDate(date_str).range(min_max_year=min_max_year)
50 changes: 50 additions & 0 deletions tests/functional/refine/cram/year-bounds-error.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
Setup

$ source "$TESTDIR"/_setup.sh

Create metadata with a strain that has partial ambiguity on the century-level
(20XX) so that --year-bounds is applied.

$ cat >metadata.tsv <<~~
> strain date
> PAN/CDC_259359_V1_V3/2015 20XX-XX-XX
> ~~


Check that invalid --year-bounds provides useful error messages.

$ ${AUGUR} refine \
> --tree "$TESTDIR/../data/tree_raw.nwk" \
> --alignment "$TESTDIR/../data/aligned.fasta" \
> --metadata metadata.tsv \
> --output-tree tree.nwk \
> --output-node-data branch_lengths.json \
> --timetree \
> --year-bounds 1950 1960 1970 \
> --divergence-units mutations > /dev/null
ERROR: Invalid value for --year-bounds: The year bounds [1950, 1960, 1970] must have only one (lower) or two (lower, upper) bounds.
[2]

$ ${AUGUR} refine \
> --tree "$TESTDIR/../data/tree_raw.nwk" \
> --alignment "$TESTDIR/../data/aligned.fasta" \
> --metadata metadata.tsv \
> --output-tree tree.nwk \
> --output-node-data branch_lengths.json \
> --timetree \
> --year-bounds 0 1000 \
> --divergence-units mutations > /dev/null
ERROR: Invalid value for --year-bounds: 0 is not a valid year.
[2]

$ ${AUGUR} refine \
> --tree "$TESTDIR/../data/tree_raw.nwk" \
> --alignment "$TESTDIR/../data/aligned.fasta" \
> --metadata metadata.tsv \
> --output-tree tree.nwk \
> --output-node-data branch_lengths.json \
> --timetree \
> --year-bounds -2000 -3000 \
> --divergence-units mutations > /dev/null
ERROR: Invalid value for --year-bounds: -3000 is not a valid year.
[2]
66 changes: 66 additions & 0 deletions tests/functional/refine/cram/year-bounds.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
Setup

$ source "$TESTDIR"/_setup.sh

Create a copy of tests/functional/refine/data/metadata.tsv, adding partial ambiguity on the century-level (20XX) for the first strain PAN/CDC_259359_V1_V3/2015.

$ cat >metadata.tsv <<~~
> strain date
> PAN/CDC_259359_V1_V3/2015 20XX-XX-XX
> COL/FLR_00024/2015 2015-12-XX
> PRVABC59 2015-12-XX
> COL/FLR_00008/2015 2015-12-XX
> Colombia/2016/ZC204Se 2016-01-06
> ZKC2/2016 2016-02-16
> VEN/UF_1/2016 2016-03-25
> DOM/2016/BB_0059 2016-04-04
> BRA/2016/FC_6706 2016-04-08
> DOM/2016/BB_0183 2016-04-18
> EcEs062_16 2016-04-XX
> HND/2016/HU_ME59 2016-05-13
> ~~

Limit ambiguous dates to be within (2000, 2020).

$ ${AUGUR} refine \
> --tree "$TESTDIR/../data/tree_raw.nwk" \
> --alignment "$TESTDIR/../data/aligned.fasta" \
> --metadata metadata.tsv \
> --output-tree tree.nwk \
> --output-node-data branch_lengths.json \
> --timetree \
> --year-bounds 2000 2020 \
> --coalescent opt \
> --date-confidence \
> --date-inference marginal \
> --clock-filter-iqd 4 \
> --seed 314159 \
> --divergence-units mutations &> /dev/null

Check that the inferred date is 2020-12-31.
TODO: Using jq woud be cleaner, but requires an extra dev dependency.

$ python3 -c 'import json, sys; print(json.load(sys.stdin)["nodes"]["PAN/CDC_259359_V1_V3/2015"]["date"])' < branch_lengths.json
2020-12-31

Reverse the order to check that order does not matter.

$ ${AUGUR} refine \
> --tree "$TESTDIR/../data/tree_raw.nwk" \
> --alignment "$TESTDIR/../data/aligned.fasta" \
> --metadata metadata.tsv \
> --output-tree tree.nwk \
> --output-node-data branch_lengths.json \
> --timetree \
> --year-bounds 2020 2000 \
> --coalescent opt \
> --date-confidence \
> --date-inference marginal \
> --clock-filter-iqd 4 \
> --seed 314159 \
> --divergence-units mutations &> /dev/null

Run the same check as above.

$ python3 -c 'import json, sys; print(json.load(sys.stdin)["nodes"]["PAN/CDC_259359_V1_V3/2015"]["date"])' < branch_lengths.json
2020-12-31

0 comments on commit c12c8ee

Please sign in to comment.