Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Break utils.ambiguous_date_to_date_range out into a new class #532

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added augur/util_support/__init__.py
Empty file.
123 changes: 123 additions & 0 deletions augur/util_support/date_disambiguator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import calendar
import datetime
import functools
import re


def tuple_to_date(year, month, day):
    """
    Build a `datetime.date` from integer components, clamping overflow.

    A month greater than 12 is reduced to 12, and a day past the end of the
    (clamped) month is reduced to that month's last day. Values that are too
    small are not adjusted here.
    """
    clamped_month = month if month <= 12 else 12
    last_day = max_day_for_year_month(year, clamped_month)
    clamped_day = day if day <= last_day else last_day

    return datetime.date(year, clamped_month, clamped_day)


def max_day_for_year_month(year, month):
    """Return the number of days in `month` of `year`, as reported by `calendar.monthrange`."""
    _first_weekday, days_in_month = calendar.monthrange(year, month)
    return days_in_month


def resolve_uncertain_int(uncertain_string, min_or_max):
    """
    Takes a string representation of an integer with uncertain places
    occupied by the character `X`. Returns the minimum or maximum
    possible integer.

    Every `X` is replaced by `0` (for "min") or `9` (for "max") before the
    string is converted with `int()`. A result of exactly 0 is bumped to 1.

    Raises `ValueError` if `min_or_max` is neither "min" nor "max", or if the
    substituted string is not a valid integer literal.
    """
    if min_or_max == "min":
        fill_digit = "0"
    elif min_or_max == "max":
        fill_digit = "9"
    else:
        # Raising a bare string is itself a TypeError in Python 3, which hid
        # this message from the caller; raise a real exception instead.
        raise ValueError(
            "Tried to resolve an uncertain integer to something other than `min` or `max`."
        )

    result = int(uncertain_string.replace("X", fill_digit))

    if result == 0:
        # A date component cannot be 0. Well, year can, but...
        result = 1

    return result


class DateDisambiguator:
    """Transforms a date string with uncertainty into the range of possible dates.

    Uncertain digits are written as `X`, e.g. `2000-01-XX`. The format string
    currently only supports the `%Y`, `%m`, and `%d` directives; every other
    character in it is matched literally.

    The constructor validates the input and raises `ValueError` when the date
    does not match the format, or when a more significant component is
    uncertain while a less significant one is constrained.

    NOTE(review): `min_max_year` is stored on the instance but nothing in this
    class reads it -- confirm whether `range()` is expected to honour it.
    """

    # Capture group emitted for each supported directive.
    _DIRECTIVE_PATTERNS = {"%Y": "(....)", "%m": "(..?)", "%d": "(..?)"}
    # Splitting on a capturing group keeps the directives in the result, so the
    # token list alternates literal, directive, literal, ...
    _DIRECTIVE_SPLITTER = re.compile(r"(%[Ymd])")

    def __init__(self, uncertain_date, fmt="%Y-%m-%d", min_max_year=None):
        self.uncertain_date = uncertain_date
        self.fmt = fmt
        self.min_max_year = min_max_year

        self.assert_only_less_significant_uncertainty()

    def range(self):
        """
        Return `(min_date, max_date)` for the uncertain date.

        Each component is resolved to its smallest and largest possible value;
        the upper bound is additionally capped at today's date.
        """
        components = self.uncertain_date_components

        min_date = tuple_to_date(
            resolve_uncertain_int(components["Y"], "min"),
            resolve_uncertain_int(components["m"], "min"),
            resolve_uncertain_int(components["d"], "min"),
        )

        max_date = tuple_to_date(
            resolve_uncertain_int(components["Y"], "max"),
            resolve_uncertain_int(components["m"], "max"),
            resolve_uncertain_int(components["d"], "max"),
        )
        max_date = min(max_date, datetime.date.today())

        return (min_date, max_date)

    @property
    def uncertain_date_components(self):
        """
        Map each format component letter (`Y`, `m`, `d`) to its matched text.

        The result is memoised on the instance rather than in a shared
        `lru_cache`, which would keep every instance alive through its key.

        Raises `ValueError` if the date string does not match the format.
        """
        cached = getattr(self, "_uncertain_date_components", None)
        if cached is None:
            matches = self.regex.search(self.uncertain_date)

            if matches is None:
                raise ValueError(
                    f"Malformed uncertain date `{self.uncertain_date}` for format `{self.fmt}`"
                )

            cached = dict(zip(self.fmt_components, matches.groups()))
            self._uncertain_date_components = cached

        return cached

    @property
    def fmt_components(self):
        """
        Return the directive letters of `fmt`, in order of appearance.

        Derived from the same tokens as `regex`, so the letters always line up
        with the regex capture groups even when `fmt` starts with literal text.
        """
        tokens = self._DIRECTIVE_SPLITTER.split(self.fmt)
        # Odd positions hold the directives; drop the leading `%`.
        return [directive[1] for directive in tokens[1::2]]

    @property
    def regex(self):
        """
        Returns regex defined by the format string.
        Currently only supports %Y, %m, and %d.

        Literal text between directives is escaped so that characters such as
        `.` in `%Y.%m.%d` match only themselves.
        """
        tokens = self._DIRECTIVE_SPLITTER.split(self.fmt)
        pieces = [
            self._DIRECTIVE_PATTERNS[token] if index % 2 else re.escape(token)
            for index, token in enumerate(tokens)
        ]
        return re.compile("^" + "".join(pieces) + "$")

    def assert_only_less_significant_uncertainty(self):
        """
        Raise an exception if a constrained digit appears in a less-significant place
        than an uncertain digit.

        Assuming %Y-%m-%d, these patterns are valid:
            2000-01-01
            2000-01-XX
            2000-XX-XX

        but this is invalid, because month is uncertain but day is constrained:
            2000-XX-01

        These invalid cases are assumed to be unintended use of the tool.
        """
        components = self.uncertain_date_components

        if "X" in components["Y"]:
            if components["m"] != "XX" or components["d"] != "XX":
                raise ValueError(
                    "Invalid date: Year contains uncertainty, so month and day must also be uncertain."
                )
        elif "X" in components["m"]:
            if components["d"] != "XX":
                raise ValueError(
                    "Invalid date: Month contains uncertainty, so day must also be uncertain."
                )
36 changes: 4 additions & 32 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import packaging.version as packaging_version
from .validate import validate, ValidateError, load_json_schema

from augur.util_support.date_disambiguator import DateDisambiguator

class AugurException(Exception):
    """Exception type declared by this module; it adds nothing to `Exception`."""
    pass

Expand Down Expand Up @@ -62,38 +64,8 @@ def get_json_name(args, default=None):
raise ValueError("Please specify a name for the JSON file containing the results.")


def ambiguous_date_to_date_range(mydate, fmt, min_max_year=None):
    """
    Resolve a date string whose unknown fields are masked with `XX` into a
    `(lower_bound, upper_bound)` pair of `datetime.date` objects.

    Parameters
    ----------
    mydate : str
        Date string such as ``"2000-01-XX"``.
    fmt : str
        Format such as ``"%Y-%m-%d"``. The field separator is taken to be the
        character that follows the first directive.
    min_max_year : sequence, optional
        ``[min_year]`` or ``[min_year, max_year]``, used only when the year
        field is masked.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``, with the upper bound capped at today.
        ``(None, None)`` when the year is masked and `min_max_year` is empty
        or not given.
    """
    import calendar
    from datetime import datetime

    sep = fmt.split('%')[1][-1]
    min_date, max_date = {}, {}
    today = datetime.today().date()

    for val, field in zip(mydate.split(sep), fmt.split(sep + '%')):
        lowered = field.lower()
        f = 'year' if 'y' in lowered else ('day' if 'd' in lowered else 'month')
        if 'XX' in val:
            if f == 'year':
                if min_max_year:
                    min_date[f] = min_max_year[0]
                    if len(min_max_year) > 1:
                        max_date[f] = min_max_year[1]
                    elif len(min_max_year) == 1:
                        max_date[f] = 4000  # will be replaced by 'today' below.
                else:
                    return None, None
            elif f == 'month':
                min_date[f] = 1
                max_date[f] = 12
            elif f == 'day':
                min_date[f] = 1
                max_date[f] = 31
        else:
            min_date[f] = int(val)
            max_date[f] = int(val)

    # Clamp to the real length of the month. A fixed 28 for February dropped
    # Feb 29 in leap years, putting the upper bound before the lower bound.
    max_date['day'] = min(
        max_date['day'],
        calendar.monthrange(max_date['year'], max_date['month'])[1],
    )
    lower_bound = datetime(year=min_date['year'], month=min_date['month'], day=min_date['day']).date()
    upper_bound = datetime(year=max_date['year'], month=max_date['month'], day=max_date['day']).date()
    return (lower_bound, upper_bound if upper_bound < today else today)
def ambiguous_date_to_date_range(uncertain_date, fmt, min_max_year=None):
    """Return the range produced by `DateDisambiguator(...).range()` for `uncertain_date`."""
    disambiguator = DateDisambiguator(
        uncertain_date,
        fmt=fmt,
        min_max_year=min_max_year,
    )
    return disambiguator.range()

def read_metadata(fname, query=None):
if not fname:
Expand Down
8 changes: 5 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pathlib import Path
from setuptools import setup
import setuptools
import sys

min_version = (3, 6)
Expand All @@ -26,7 +26,9 @@
with readme_file.open(encoding = "utf-8") as f:
long_description = f.read()

setup(


setuptools.setup(
name = "nextstrain-augur",
version = __version__,
author = "Nextstrain developers",
Expand All @@ -41,7 +43,7 @@
"Change Log": "https://github.com/nextstrain/augur/blob/master/CHANGES.md#next",
"Source": "https://github.com/nextstrain/augur",
},
packages = ['augur'],
packages = setuptools.find_packages(),
package_data = {'augur': ['data/*']},
data_files = [("", ["LICENSE.txt"])],
python_requires = '>={}'.format('.'.join(str(n) for n in min_version)),
Expand Down
8 changes: 1 addition & 7 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,6 @@ def test_ambiguous_date_to_date_range_ambiguous_day(self):
datetime.date(year=2000, month=1, day=31),
)

def test_ambiguous_date_to_date_range_ambiguous_month(self):
    """A masked month with a fixed day spans January to December on that day."""
    expected_bounds = (
        datetime.date(year=2000, month=1, day=5),
        datetime.date(year=2000, month=12, day=5),
    )
    actual_bounds = utils.ambiguous_date_to_date_range("2000-XX-5", "%Y-%m-%d")
    assert actual_bounds == expected_bounds

def test_ambiguous_date_to_date_range_ambiguous_month_and_day(self):
assert utils.ambiguous_date_to_date_range("2000-XX-XX", "%Y-%m-%d") == (
datetime.date(year=2000, month=1, day=1),
Expand Down Expand Up @@ -127,4 +121,4 @@ def test_read_metadata_bad_query(self, tmpdir):
with open(meta_fn, "w") as fh:
fh.write("\n".join(meta_lines))
with pytest.raises(SystemExit):
utils.read_metadata(meta_fn, query='badcol=="goodval"')
utils.read_metadata(meta_fn, query='badcol=="goodval"')
82 changes: 82 additions & 0 deletions tests/util_support/test_date_disambiguator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import datetime

from augur.util_support import date_disambiguator
from augur.util_support.date_disambiguator import DateDisambiguator

from freezegun import freeze_time
import pytest


class TestDateDisambiguator:
    """Tests for `DateDisambiguator` and the module-level `resolve_uncertain_int`."""

    @freeze_time("2111-05-05")
    @pytest.mark.parametrize(
        "date_str, expected_range",
        [
            ("2000-01-01", (datetime.date(2000, 1, 1), datetime.date(2000, 1, 1))),
            ("2000-02-XX", (datetime.date(2000, 2, 1), datetime.date(2000, 2, 29))),
            ("2000-XX-XX", (datetime.date(2000, 1, 1), datetime.date(2000, 12, 31))),
        ],
    )
    def test_range(self, date_str, expected_range):
        """`range()` spans every date the uncertain string could denote."""
        disambiguator = DateDisambiguator(date_str)
        assert disambiguator.range() == expected_range

    @pytest.mark.parametrize(
        "date_str, fmt",
        [
            ("2005-02-XX", "%Y-%m-%d"),
            ("2005/02/XX", "%Y/%m/%d"),
            ("2005-XX-02", "%Y-%d-%m"),
            ("200502XX", "%Y%m%d"),
        ],
    )
    def test_range_separators(self, date_str, fmt):
        """Separator choice and component order do not change the resolved range."""
        february_2005 = (
            datetime.date(2005, 2, 1),
            datetime.date(2005, 2, 28),
        )
        disambiguator = DateDisambiguator(date_str, fmt=fmt)
        assert disambiguator.range() == february_2005

    @pytest.mark.parametrize(
        "date_str, expected_components",
        [
            ("2000-01-01", {"Y": "2000", "m": "01", "d": "01"}),
            ("2000-01-XX", {"Y": "2000", "m": "01", "d": "XX"}),
            ("2000-XX-XX", {"Y": "2000", "m": "XX", "d": "XX"}),
        ],
    )
    def test_uncertain_date_components(self, date_str, expected_components):
        """Each format component maps to the raw text matched for it."""
        components = DateDisambiguator(date_str).uncertain_date_components
        assert components == expected_components

    def test_uncertain_date_components_error(self):
        """A string that does not fit the format is reported as malformed."""
        with pytest.raises(ValueError, match="Malformed uncertain date"):
            DateDisambiguator("5-5-5-5-5").uncertain_date_components

    @pytest.mark.parametrize(
        "date_str, min_or_max, expected",
        [
            ("2000", "min", 2000),
            ("2000", "max", 2000),
            ("200X", "min", 2000),
            ("200X", "max", 2009),
            ("20X0", "max", 2090),
            ("X000", "max", 9000),
            ("XXXX", "min", 1),
            ("XXXX", "max", 9999),
        ],
    )
    def test_resolve_uncertain_int(self, date_str, min_or_max, expected):
        """`X` places resolve to the lowest or highest digit, with 0 bumped to 1."""
        resolved = date_disambiguator.resolve_uncertain_int(date_str, min_or_max)
        assert resolved == expected

    @pytest.mark.parametrize(
        "date_str, expected_error",
        [
            ("200X-01-01", "so month and day must also be uncertain"),
            ("2000-XX-01", "so day must also be uncertain"),
        ],
    )
    def test_assert_only_less_significant_uncertainty(self, date_str, expected_error):
        """Constructing with a constrained field below an uncertain one fails."""
        with pytest.raises(ValueError, match=expected_error):
            DateDisambiguator(date_str)