Skip to content

Commit

Permalink
Move date logic from filter to new dates module
Browse files Browse the repository at this point in the history
With #740, filter's numeric_date() provides support for 3 date formats.
However, supporting various date formats is not specific to filter (e.g. frequencies has a separate numeric_date() which is now out-of-sync with this filter's numeric_date()).

This commit:

1. Moves numeric_date() to a new submodule augur.dates
2. Moves the related SUPPORTED_DATE_HELP_TEXT to augur.dates
3. Updates numeric_date() to raise a ValueError rather than argparse.ArgumentTypeError so it can be generalized to non-argparse usage
4. Adds a new function numeric_date_type() which wraps numeric_date() and raises an argparse.ArgumentTypeError per #740 (comment)
  • Loading branch information
victorlin committed Apr 15, 2022
1 parent 24b96e6 commit 2b9ba06
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 57 deletions.
65 changes: 65 additions & 0 deletions augur/dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import argparse
import datetime
from textwrap import dedent
import isodate
import treetime.utils

# Numbered list of the date formats that numeric_date() accepts. It is appended
# to numeric_date()'s error message and to the --min-date/--max-date help text.
SUPPORTED_DATE_HELP_TEXT = (
    "1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or\n"
    "2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or\n"
    "3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')\n"
)

def numeric_date(date):
    """
    Converts the given *date* string to a :py:class:`float`.

    *date* may be given as:

    1. A string or float (number) with year as the integer part
    2. A string in the YYYY-MM-DD (ISO 8601) syntax
    3. A string representing a relative date (duration before datetime.date.today())

    Raises a :py:class:`ValueError` when *date* matches none of the formats
    above. This includes partial dates such as ``"2020-06"`` and relative
    dates that reach outside the range :py:class:`datetime.date` can represent.

    >>> numeric_date("2020.42")
    2020.42
    >>> numeric_date("2020-06-04")
    2020.42486...
    >>> import datetime, isodate, treetime
    >>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W"))
    True
    """
    # date is numeric
    try:
        return float(date)
    except ValueError:
        pass

    # date is in YYYY-MM-DD form
    try:
        # Unpacking into exactly three names makes a partial date such as
        # "2020-06" fail with ValueError right here. Passing the pieces
        # straight to datetime.date() would instead raise a TypeError for the
        # missing argument, which would escape this handler and hide the
        # supported-formats message below.
        year, month, day = map(int, date.split("-", 2))
        return treetime.utils.numeric_date(datetime.date(year, month, day))
    except ValueError:
        pass

    # date is a duration treated as a backwards-looking relative date
    try:
        duration_str = str(date)
        # The leading 'P' of an ISO 8601 duration is optional for callers.
        if not duration_str.startswith('P'):
            duration_str = 'P' + duration_str
        return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str))
    except (ValueError, OverflowError, isodate.ISO8601Error):
        # OverflowError: date arithmetic raises it when the resulting date
        # falls outside the years datetime.date supports.
        pass

    raise ValueError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""")

def numeric_date_type(date):
    """Adapter that lets numeric_date() be used as an argparse ``type=`` callable.

    The ValueError raised by numeric_date() is re-raised as an
    ArgumentTypeError, otherwise the custom exception message won't be shown
    in console output due to:
    https://github.com/python/cpython/blob/5c4d1f6e0e192653560ae2941a6677fbf4fbd1f2/Lib/argparse.py#L2503-L2513
    """
    try:
        converted = numeric_date(date)
    except ValueError as error:
        # Keep the original exception chained as the cause for debugging.
        message = str(error)
        raise argparse.ArgumentTypeError(message) from error
    return converted
60 changes: 3 additions & 57 deletions augur/filter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
"""
Filter and subsample a sequence set.
"""
import argparse
from Bio import SeqIO
from collections import defaultdict
import csv
import datetime
import heapq
import isodate
import itertools
import json
import numpy as np
Expand All @@ -18,10 +15,9 @@
import re
import sys
from tempfile import NamedTemporaryFile
import treetime.utils
from textwrap import dedent
from typing import Collection

from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT
from .index import index_sequences, index_vcf
from .io import open_file, read_metadata, read_sequences, write_sequences
from .utils import is_vcf as filename_is_vcf, read_vcf, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous
Expand All @@ -33,12 +29,6 @@
"non_nucleotide",
)

# Numbered list of the accepted date formats, one per line with a trailing
# newline. Used in the --min-date/--max-date help text and in numeric_date()'s
# error message.
SUPPORTED_DATE_HELP_TEXT = "\n".join((
    "1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or",
    "2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or",
    "3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')",
    "",
))


class FilterException(Exception):
"""Representation of an error that occurred during filtering.
Expand Down Expand Up @@ -1118,8 +1108,8 @@ def register_arguments(parser):
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
)
metadata_filter_group.add_argument('--min-date', type=numeric_date, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--max-date', type=numeric_date, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
Expand Down Expand Up @@ -1694,50 +1684,6 @@ def _filename_gz(filename):
return filename.lower().endswith(".gz")


def numeric_date(date):
    """
    Converts the given *date* string to a :py:class:`float`.

    *date* may be given as:

    1. A string or float (number) with year as the integer part
    2. A string in the YYYY-MM-DD (ISO 8601) syntax
    3. A string representing a relative date (duration before datetime.date.today())

    Raises an :py:class:`argparse.ArgumentTypeError` when *date* matches none
    of the formats above. This includes partial dates such as ``"2020-06"``
    and relative dates that reach outside the range
    :py:class:`datetime.date` can represent.

    >>> numeric_date("2020.42")
    2020.42
    >>> numeric_date("2020-06-04")
    2020.42486...
    >>> import datetime, isodate, treetime
    >>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W"))
    True
    """
    # date is numeric
    try:
        return float(date)
    except ValueError:
        pass

    # date is in YYYY-MM-DD form
    try:
        # Requiring exactly three components turns a partial date such as
        # "2020-06" into a ValueError here. Otherwise datetime.date() would
        # raise a TypeError for the missing argument, which this handler does
        # not catch, and the supported-formats message would be lost.
        year, month, day = map(int, date.split("-", 2))
        return treetime.utils.numeric_date(datetime.date(year, month, day))
    except ValueError:
        pass

    # date is a duration treated as a backwards-looking relative date
    try:
        duration_str = str(date)
        # The leading 'P' of an ISO 8601 duration is optional for callers.
        if not duration_str.startswith('P'):
            duration_str = 'P' + duration_str
        return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str))
    except (ValueError, OverflowError, isodate.ISO8601Error):
        # OverflowError: date arithmetic raises it when the resulting date
        # falls outside the years datetime.date supports.
        pass

    raise argparse.ArgumentTypeError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""")


def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True):
"""Calculate the number of sequences per group for a given maximum number of
sequences to be returned and the number of sequences in each requested
Expand Down

0 comments on commit 2b9ba06

Please sign in to comment.