diff --git a/augur/filter.py b/augur/filter.py index 197786168..65e834704 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -1,11 +1,13 @@ """ Filter and subsample a sequence set. """ +import argparse from Bio import SeqIO from collections import defaultdict import csv import datetime import heapq +import isodate import itertools import json import numpy as np @@ -17,6 +19,7 @@ import sys from tempfile import NamedTemporaryFile import treetime.utils +from textwrap import dedent from typing import Collection from .index import index_sequences, index_vcf @@ -30,6 +33,12 @@ "non_nucleotide", ) +SUPPORTED_DATE_HELP_TEXT = dedent("""\ + 1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or + 2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or + 3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W') +""") + class FilterException(Exception): """Representation of an error that occurred during filtering. @@ -1109,8 +1118,10 @@ def register_arguments(parser): Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax. (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")""" ) - metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD") - metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD") + metadata_filter_group.add_argument('--min-date', type=numeric_date, + help=f"""minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}""") + metadata_filter_group.add_argument('--max-date', type=numeric_date, + help=f"""maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}""") metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'], help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").') metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude") @@ -1689,18 +1700,44 @@ def numeric_date(date): """ Converts the given *date* string to a :py:class:`float`. - *date* may be given as a number (a float) with year as the integer part, or - in the YYYY-MM-DD (ISO 8601) syntax. + *date* may be given as: + 1. A string or float (number) with year as the integer part + 2. A string in the YYYY-MM-DD (ISO 8601) syntax + 3. A string representing a relative date (duration before datetime.date.today()) >>> numeric_date("2020.42") 2020.42 >>> numeric_date("2020-06-04") 2020.42486... + >>> import datetime, isodate, treetime + >>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W")) + True """ + # date is numeric try: return float(date) except ValueError: + pass + + # date is in YYYY-MM-DD form + try: return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2)))) + except ValueError: + pass + + # date is a duration treated as a backwards-looking relative date + try: + # make a copy of date for this block + duration_str = str(date) + if duration_str.startswith('P'): + duration_str = duration_str + else: + duration_str = 'P'+duration_str + return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str)) + except (ValueError, isodate.ISO8601Error): + pass + + raise argparse.ArgumentTypeError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""") def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True): diff --git a/setup.py b/setup.py index 907168ae7..f5eda48a4 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ install_requires = [ "bcbio-gff >=0.6.0, ==0.6.*", "biopython >=1.67, !=1.77, !=1.78", + "isodate ==0.6.*", "jsonschema >=3.0.0, ==3.*", "networkx >= 2.5, ==2.*", "packaging >=19.2", diff --git a/tests/test_filter.py b/tests/test_filter.py index 68131f193..1c0f3ed45 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -1,4 +1,5 @@ import argparse +from textwrap import dedent import numpy as np import random import shlex @@ -9,6 +10,8 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from freezegun import freeze_time + import augur.filter from augur.utils import read_metadata @@ -265,3 +268,152 @@ def test_filter_date_formats(self, tmpdir, fasta_fn, argparser): augur.filter.run(args) output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta")) assert list(output.keys()) == ["SEQ_1", "SEQ_2", "SEQ_3"] + + @freeze_time("2020-03-25") + @pytest.mark.parametrize( + "argparse_params, metadata_rows, output_sorted_expected", + [ + ( + "--min-date 1D", + ( + ("SEQ_1","2020-03-23"), + ("SEQ_2","2020-03-24"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--max-date 1D", + ( + ("SEQ_1","2020-03-23"), + ("SEQ_2","2020-03-24"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_1", "SEQ_2"], + ), + ( + "--min-date 4W", + ( + ("SEQ_1","2020-02-25"), + ("SEQ_2","2020-02-26"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--max-date 4W", + ( + ("SEQ_1","2020-02-25"), + ("SEQ_2","2020-02-26"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_1", "SEQ_2"], + ), + ( + "--min-date 1M", + ( + ("SEQ_1","2020-01-25"), + ("SEQ_2","2020-02-25"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--max-date 1M", + ( + ("SEQ_1","2020-01-25"), + ("SEQ_2","2020-02-25"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_1", "SEQ_2"], + ), + ( + "--min-date P1M", + ( + ("SEQ_1","2020-01-25"), + ("SEQ_2","2020-02-25"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--max-date P1M", + ( + ("SEQ_1","2020-01-25"), + ("SEQ_2","2020-02-25"), + ("SEQ_3","2020-03-25"), + ), + ["SEQ_1", "SEQ_2"], + ), + ( + "--min-date 2Y", + ( + ("SEQ_1","2017-03-25"), + ("SEQ_2","2018-03-25"), + ("SEQ_3","2019-03-25"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--max-date 2Y", + ( + ("SEQ_1","2017-03-25"), + ("SEQ_2","2018-03-25"), + ("SEQ_3","2019-03-25"), + ), + ["SEQ_1", "SEQ_2"], + ), + ( + "--min-date 1Y2W5D", + ( + ("SEQ_1","2019-03-05"), + ("SEQ_2","2019-03-06"), + ("SEQ_3","2019-03-07"), + ), + ["SEQ_2", "SEQ_3"], + ), + ( + "--max-date 1Y2W5D", + ( + ("SEQ_1","2019-03-05"), + ("SEQ_2","2019-03-06"), + ("SEQ_3","2019-03-07"), + ), + ["SEQ_1", "SEQ_2"], + ), + ], + ) + def test_filter_relative_dates(self, tmpdir, argparser, argparse_params, metadata_rows, output_sorted_expected): + """Test that various relative dates work""" + out_fn = str(tmpdir / "filtered.txt") + meta_fn = write_metadata(tmpdir, (("strain","date"), + *metadata_rows)) + args = argparser(f'--metadata {meta_fn} --output-strains {out_fn} {argparse_params}') + augur.filter.run(args) + with open(out_fn) as f: + output_sorted = sorted(line.rstrip() for line in f) + assert output_sorted == output_sorted_expected + + @freeze_time("2020-03-25") + @pytest.mark.parametrize( + "argparse_flag, argparse_value", + [ + ("--min-date", "3000Y"), + ("--max-date", "3000Y"), + ("--min-date", "invalid"), + ("--max-date", "invalid"), + ], + ) + def test_filter_relative_dates_error(self, tmpdir, argparser, argparse_flag, argparse_value): + """Test that invalid dates fail""" + out_fn = str(tmpdir / "filtered.txt") + meta_fn = write_metadata(tmpdir, (("strain","date"), + ("SEQ_1","2020-03-23"))) + with pytest.raises(SystemExit) as e_info: + argparser(f'--metadata {meta_fn} --output-strains {out_fn} {argparse_flag} {argparse_value}') + assert e_info.value.__context__.message == dedent(f"""\ + Unable to determine date from '{argparse_value}'. Ensure it is in one of the supported formats: + 1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or + 2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or + 3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W') + """)