From 873a70a34105fd6579d2780d4dba946a1524363c Mon Sep 17 00:00:00 2001 From: Benjamin Otter Date: Thu, 24 Jun 2021 12:52:40 +0200 Subject: [PATCH 1/5] filter: Add support for relative dates in --min-date and --max-date The relative dates are parsed by `numeric_date` which uses datetime.date.today() to translate the relative date to an absolute date. Relative dates are positive duration values following the ISO 8601 duration syntax e.g. `--min-date 1Y2W5D` for 1 year, 2 weeks and 5 days ago or `--max-date 1D` for yesterday This also adds a package dependency `isodate` to parse the duration string. --- augur/filter.py | 41 +++++++++++++++++++++++++++++++++++++---- setup.py | 1 + 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index 197786168..28d269286 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -6,6 +6,7 @@ import csv import datetime import heapq +import isodate import itertools import json import numpy as np @@ -1109,8 +1110,16 @@ def register_arguments(parser): Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax. (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")""" ) - metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD") - metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD") + metadata_filter_group.add_argument('--min-date', type=numeric_date, + help="""minimal cutoff for date, the cutoff date is inclusive; may be specified as: + 1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) + 2. 
a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') + 3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')""") + metadata_filter_group.add_argument('--max-date', type=numeric_date, + help="""maximal cutoff for date, the cutoff date is inclusive; may be specified as: + 1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) + 2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') + 3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')""") metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'], help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").') metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude") @@ -1689,18 +1698,42 @@ def numeric_date(date): """ Converts the given *date* string to a :py:class:`float`. - *date* may be given as a number (a float) with year as the integer part, or - in the YYYY-MM-DD (ISO 8601) syntax. + *date* may be given as: + 1. A string or float (number) with year as the integer part + 2. A string in the YYYY-MM-DD (ISO 8601) syntax + 3. A string representing a relative date (duration before datetime.date.today()) >>> numeric_date("2020.42") 2020.42 >>> numeric_date("2020-06-04") 2020.42486... 
+ >>> import datetime, isodate, treetime + >>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W")) + True """ + # date is numeric try: return float(date) except ValueError: + pass + + # date is in YYYY-MM-DD form + try: return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2)))) + except ValueError: + pass + + # date is a duration treated as a backwards-looking relative date + try: + # make a copy of date for this block + duration_str = str(date) + if duration_str.startswith('P'): + duration_str = duration_str + else: + duration_str = 'P'+duration_str + return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str)) + except (ValueError, isodate.ISO8601Error): + pass def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True): diff --git a/setup.py b/setup.py index 907168ae7..f5eda48a4 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ install_requires = [ "bcbio-gff >=0.6.0, ==0.6.*", "biopython >=1.67, !=1.77, !=1.78", + "isodate ==0.6.*", "jsonschema >=3.0.0, ==3.*", "networkx >= 2.5, ==2.*", "packaging >=19.2", From eafadeff58b2d98b19b62377d18e68ae16a7fd9d Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 29 Mar 2022 14:11:09 -0700 Subject: [PATCH 2/5] Add tests for relative dates --min-date and --max-date --- tests/test_filter.py | 91 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/tests/test_filter.py b/tests/test_filter.py index 68131f193..248c1706d 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -9,6 +9,8 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from freezegun import freeze_time + import augur.filter from augur.utils import read_metadata @@ -265,3 +267,92 @@ def test_filter_date_formats(self, tmpdir, fasta_fn, argparser): augur.filter.run(args) output = 
SeqIO.to_dict(SeqIO.parse(out_fn, "fasta")) assert list(output.keys()) == ["SEQ_1", "SEQ_2", "SEQ_3"] + + @freeze_time("2020-03-25") + @pytest.mark.parametrize( + "argparse_params, metadata_rows, output_sorted_expected", + [ + ( + "--min-date 1M", + ( + ("SEQ_1","2020-01-25"), + ("SEQ_2","2020-02-25"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--min-date P1M", + ( + ("SEQ_1","2020-01-25"), + ("SEQ_2","2020-02-25"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--min-date 2Y", + ( + ("SEQ_1","2017-03-25"), + ("SEQ_2","2018-03-25"), + ("SEQ_3","2019-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--min-date 4W", + ( + ("SEQ_1","2020-02-25"), + ("SEQ_2","2020-02-26"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--min-date 1Y2W5D", + ( + ("SEQ_1","2019-03-05"), + ("SEQ_2","2019-03-06"), + ("SEQ_3","2019-03-07"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--max-date 1M", + ( + ("SEQ_1","2020-01-25"), + ("SEQ_2","2020-02-25"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_1", "SEQ_2"], + ), + ( + "--max-date P1M", + ( + ("SEQ_1","2020-01-25"), + ("SEQ_2","2020-02-25"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_1", "SEQ_2"], + ), + ( + "--max-date 1D", + ( + ("SEQ_1","2020-03-23"), + ("SEQ_2","2020-03-24"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_1", "SEQ_2"], + ), + ], + ) + def test_filter_relative_dates(self, tmpdir, argparser, argparse_params, metadata_rows, output_sorted_expected): + """Test that various relative dates work""" + out_fn = str(tmpdir / "filtered.txt") + meta_fn = write_metadata(tmpdir, (("strain","date"), + *metadata_rows)) + args = argparser(f'--metadata {meta_fn} --output-strains {out_fn} {argparse_params}') + augur.filter.run(args) + with open(out_fn) as f: + output_sorted = sorted(line.rstrip() for line in f) + assert output_sorted == output_sorted_expected From 6a88293550179b9de0344b65627cbd4f48085204 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Thu, 
31 Mar 2022 16:09:15 -0700 Subject: [PATCH 3/5] Error on invalid --min-date/--max-date The previous change to support relative dates also refactored numeric_date so that an invalid date does not raise any error, which is unwanted. Prior to that change, there was one try/except so treetime.utils.numeric_date would handle all these invalid dates. Explicitly raising from augur.filter.numeric_date is one step in a better direction, though still not a good solution since argparse does not expose these errors. More: https://stackoverflow.com/q/38340252 --- augur/filter.py | 4 ++++ tests/test_filter.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/augur/filter.py b/augur/filter.py index 28d269286..d556e5fdd 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -1735,6 +1735,10 @@ def numeric_date(date): except (ValueError, isodate.ISO8601Error): pass + # This currently doesn't get exposed since argparse raises a SystemExit on invalid arguments. + # TODO: find a way to provide better errors for invalid dates. 
+ raise ValueError(f"Unable to determine date from {date}.") + def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True): """Calculate the number of sequences per group for a given maximum number of diff --git a/tests/test_filter.py b/tests/test_filter.py index 248c1706d..f2f087579 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -356,3 +356,19 @@ def test_filter_relative_dates(self, tmpdir, argparser, argparse_params, metadat with open(out_fn) as f: output_sorted = sorted(line.rstrip() for line in f) assert output_sorted == output_sorted_expected + + @freeze_time("2020-03-25") + @pytest.mark.parametrize( + "argparse_params", + [ + "--max-date 3000Y", + "--max-date invalid", + ], + ) + def test_filter_relative_dates_error(self, tmpdir, argparser, argparse_params): + """Test that invalid dates fail""" + out_fn = str(tmpdir / "filtered.txt") + meta_fn = write_metadata(tmpdir, (("strain","date"), + ("SEQ_1","2020-03-23"))) + with pytest.raises(SystemExit): + argparser(f'--metadata {meta_fn} --output-strains {out_fn} {argparse_params}') From a2ac8e5ac3c80fa12cc777f9f8a7570b70b4cc77 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 6 Apr 2022 17:18:19 -0700 Subject: [PATCH 4/5] Improve error message for invalid dates - Expose the invalid date error message using argparse.ArgumentTypeError - Split out the text describing supported dates into a constant variable, since it is used in 3 places now --- augur/filter.py | 22 +++++++++++----------- tests/test_filter.py | 19 +++++++++++++------ 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index d556e5fdd..65e834704 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -1,6 +1,7 @@ """ Filter and subsample a sequence set. 
""" +import argparse from Bio import SeqIO from collections import defaultdict import csv @@ -18,6 +19,7 @@ import sys from tempfile import NamedTemporaryFile import treetime.utils +from textwrap import dedent from typing import Collection from .index import index_sequences, index_vcf @@ -31,6 +33,12 @@ "non_nucleotide", ) +SUPPORTED_DATE_HELP_TEXT = dedent("""\ + 1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or + 2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or + 3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W') +""") + class FilterException(Exception): """Representation of an error that occurred during filtering. @@ -1111,15 +1119,9 @@ def register_arguments(parser): (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")""" ) metadata_filter_group.add_argument('--min-date', type=numeric_date, - help="""minimal cutoff for date, the cutoff date is inclusive; may be specified as: - 1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) - 2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') - 3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')""") + help=f"""minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}""") metadata_filter_group.add_argument('--max-date', type=numeric_date, - help="""maximal cutoff for date, the cutoff date is inclusive; may be specified as: - 1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) - 2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') - 3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. 
'1W', 'P1W')""") + help=f"""maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}""") metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'], help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").') metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude") @@ -1735,9 +1737,7 @@ def numeric_date(date): except (ValueError, isodate.ISO8601Error): pass - # This currently doesn't get exposed since argparse raises a SystemExit on invalid arguments. - # TODO: find a way to provide better errors for invalid dates. - raise ValueError(f"Unable to determine date from {date}.") + raise argparse.ArgumentTypeError(f"""Unable to determine date from '{date}'. 
Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""") def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True): diff --git a/tests/test_filter.py b/tests/test_filter.py index f2f087579..dc95b8620 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -1,4 +1,5 @@ import argparse +from textwrap import dedent import numpy as np import random import shlex @@ -359,16 +360,22 @@ def test_filter_relative_dates(self, tmpdir, argparser, argparse_params, metadat @freeze_time("2020-03-25") @pytest.mark.parametrize( - "argparse_params", + "argparse_flag, argparse_value", [ - "--max-date 3000Y", - "--max-date invalid", + ("--max-date", "3000Y"), + ("--max-date", "invalid"), ], ) - def test_filter_relative_dates_error(self, tmpdir, argparser, argparse_params): + def test_filter_relative_dates_error(self, tmpdir, argparser, argparse_flag, argparse_value): """Test that invalid dates fail""" out_fn = str(tmpdir / "filtered.txt") meta_fn = write_metadata(tmpdir, (("strain","date"), ("SEQ_1","2020-03-23"))) - with pytest.raises(SystemExit): - argparser(f'--metadata {meta_fn} --output-strains {out_fn} {argparse_params}') + with pytest.raises(SystemExit) as e_info: + argparser(f'--metadata {meta_fn} --output-strains {out_fn} {argparse_flag} {argparse_value}') + assert e_info.value.__context__.message == dedent(f"""\ + Unable to determine date from '{argparse_value}'. Ensure it is in one of the supported formats: + 1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or + 2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or + 3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. 
'1W', 'P1W') + """) From 29a5a65ec502378ecf3a802b5fc7fa88198dd79c Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 11 Apr 2022 16:15:21 -0700 Subject: [PATCH 5/5] Symmetrically test relative dates for --min-date/--max-date Previously, some tests for --max-date were not done for --min-date and vice-versa. Added tests and re-ordered so that similar tests are adjacent. --- tests/test_filter.py | 80 ++++++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 21 deletions(-) diff --git a/tests/test_filter.py b/tests/test_filter.py index dc95b8620..1c0f3ed45 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -274,47 +274,47 @@ def test_filter_date_formats(self, tmpdir, fasta_fn, argparser): "argparse_params, metadata_rows, output_sorted_expected", [ ( - "--min-date 1M", + "--min-date 1D", ( - ("SEQ_1","2020-01-25"), - ("SEQ_2","2020-02-25"), + ("SEQ_1","2020-03-23"), + ("SEQ_2","2020-03-24"), ("SEQ_3","2020-03-25"), ), ["SEQ_2", "SEQ_3"], ), ( - "--min-date P1M", + "--max-date 1D", ( - ("SEQ_1","2020-01-25"), - ("SEQ_2","2020-02-25"), + ("SEQ_1","2020-03-23"), + ("SEQ_2","2020-03-24"), ("SEQ_3","2020-03-25"), ), - ["SEQ_2", "SEQ_3"], + ["SEQ_1", "SEQ_2"], ), ( - "--min-date 2Y", + "--min-date 4W", ( - ("SEQ_1","2017-03-25"), - ("SEQ_2","2018-03-25"), - ("SEQ_3","2019-03-25"), + ("SEQ_1","2020-02-25"), + ("SEQ_2","2020-02-26"), + ("SEQ_3","2020-03-25"), ), ["SEQ_2", "SEQ_3"], ), ( - "--min-date 4W", + "--max-date 4W", ( ("SEQ_1","2020-02-25"), ("SEQ_2","2020-02-26"), ("SEQ_3","2020-03-25"), ), - ["SEQ_2", "SEQ_3"], + ["SEQ_1", "SEQ_2"], ), ( - "--min-date 1Y2W5D", + "--min-date 1M", ( - ("SEQ_1","2019-03-05"), - ("SEQ_2","2019-03-06"), - ("SEQ_3","2019-03-07"), + ("SEQ_1","2020-01-25"), + ("SEQ_2","2020-02-25"), + ("SEQ_3","2020-03-25"), ), ["SEQ_2", "SEQ_3"], ), @@ -327,6 +327,15 @@ def test_filter_date_formats(self, tmpdir, fasta_fn, argparser): ), ["SEQ_1", "SEQ_2"], ), + ( + "--min-date 
P1M", + ( + ("SEQ_1","2020-01-25"), + ("SEQ_2","2020-02-25"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), ( "--max-date P1M", ( @@ -337,11 +346,38 @@ def test_filter_date_formats(self, tmpdir, fasta_fn, argparser): ["SEQ_1", "SEQ_2"], ), ( - "--max-date 1D", + "--min-date 2Y", ( - ("SEQ_1","2020-03-23"), - ("SEQ_2","2020-03-24"), - ("SEQ_3","2020-03-25"), + ("SEQ_1","2017-03-25"), + ("SEQ_2","2018-03-25"), + ("SEQ_3","2019-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--max-date 2Y", + ( + ("SEQ_1","2017-03-25"), + ("SEQ_2","2018-03-25"), + ("SEQ_3","2019-03-25"), + ), + ["SEQ_1", "SEQ_2"], + ), + ( + "--min-date 1Y2W5D", + ( + ("SEQ_1","2019-03-05"), + ("SEQ_2","2019-03-06"), + ("SEQ_3","2019-03-07"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--max-date 1Y2W5D", + ( + ("SEQ_1","2019-03-05"), + ("SEQ_2","2019-03-06"), + ("SEQ_3","2019-03-07"), ), ["SEQ_1", "SEQ_2"], ), @@ -362,7 +398,9 @@ def test_filter_relative_dates(self, tmpdir, argparser, argparse_params, metadat @pytest.mark.parametrize( "argparse_flag, argparse_value", [ + ("--min-date", "3000Y"), ("--max-date", "3000Y"), + ("--min-date", "invalid"), ("--max-date", "invalid"), ], )