Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

filter: Support relative dates for --min-date and --max-date #740

Merged
merged 5 commits into from
Apr 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 41 additions & 4 deletions augur/filter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""
Filter and subsample a sequence set.
"""
import argparse
from Bio import SeqIO
from collections import defaultdict
import csv
import datetime
import heapq
import isodate
import itertools
import json
import numpy as np
Expand All @@ -17,6 +19,7 @@
import sys
from tempfile import NamedTemporaryFile
import treetime.utils
from textwrap import dedent
from typing import Collection

from .index import index_sequences, index_vcf
Expand All @@ -30,6 +33,12 @@
"non_nucleotide",
)

# Description of the date formats accepted by --min-date / --max-date.
# It is interpolated into both arguments' help text and into the error
# message raised by numeric_date, so the two always list the same formats.
SUPPORTED_DATE_HELP_TEXT = dedent("""\
1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or
2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or
3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')
""")


class FilterException(Exception):
"""Representation of an error that occurred during filtering.
Expand Down Expand Up @@ -1109,8 +1118,10 @@ def register_arguments(parser):
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
)
metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--min-date', type=numeric_date,
help=f"""minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}""")
metadata_filter_group.add_argument('--max-date', type=numeric_date,
help=f"""maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}""")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
def numeric_date(date):
    """
    Converts the given *date* string to a :py:class:`float`.

    *date* may be given as:
    1. A string or float (number) with year as the integer part
    2. A string in the YYYY-MM-DD (ISO 8601) syntax
    3. A string representing a relative date (duration before datetime.date.today())

    The formats are tried in the order listed; the first one that parses wins.

    Raises :py:class:`argparse.ArgumentTypeError` when *date* matches none of
    the supported formats, or when a relative date resolves to a day that
    :py:class:`datetime.date` cannot represent.

    >>> numeric_date("2020.42")
    2020.42
    >>> numeric_date("2020-06-04")
    2020.42486...
    >>> import datetime, isodate, treetime
    >>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W"))
    True
    """
    # date is numeric
    try:
        return float(date)
    except ValueError:
        pass

    # date is in YYYY-MM-DD form
    try:
        # Unpacking into exactly three names makes a partial date such as
        # "2020-06" fail with ValueError here, instead of reaching
        # datetime.date() with a missing argument and raising TypeError.
        year, month, day = map(int, date.split("-", 2))
        return treetime.utils.numeric_date(datetime.date(year, month, day))
    except ValueError:
        pass

    # date is a duration treated as a backwards-looking relative date
    try:
        # The leading "P" of an ISO 8601 duration is optional for callers.
        duration_str = str(date)
        if not duration_str.startswith('P'):
            duration_str = 'P' + duration_str
        # OverflowError is raised when the duration is too large for a
        # timedelta or reaches back past the earliest representable date
        # (e.g. "800000D"); treat that like any other unusable input.
        return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str))
    except (ValueError, OverflowError, isodate.ISO8601Error):
        pass

    raise argparse.ArgumentTypeError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""")


def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True):
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
install_requires = [
"bcbio-gff >=0.6.0, ==0.6.*",
"biopython >=1.67, !=1.77, !=1.78",
"isodate ==0.6.*",
huddlej marked this conversation as resolved.
Show resolved Hide resolved
"jsonschema >=3.0.0, ==3.*",
"networkx >= 2.5, ==2.*",
"packaging >=19.2",
Expand Down
152 changes: 152 additions & 0 deletions tests/test_filter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
from textwrap import dedent
import numpy as np
import random
import shlex
Expand All @@ -9,6 +10,8 @@
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from freezegun import freeze_time

import augur.filter
from augur.utils import read_metadata

Expand Down Expand Up @@ -265,3 +268,152 @@ def test_filter_date_formats(self, tmpdir, fasta_fn, argparser):
augur.filter.run(args)
output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
assert list(output.keys()) == ["SEQ_1", "SEQ_2", "SEQ_3"]

# "Today" is frozen at 2020-03-25 so that every relative date below resolves
# to a fixed cutoff date.
@freeze_time("2020-03-25")
@pytest.mark.parametrize(
"argparse_params, metadata_rows, output_sorted_expected",
[
# 1D -> cutoff 2020-03-24; the cutoff itself (SEQ_2) is kept by both
# --min-date and --max-date.
(
"--min-date 1D",
(
("SEQ_1","2020-03-23"),
("SEQ_2","2020-03-24"),
("SEQ_3","2020-03-25"),
),
["SEQ_2", "SEQ_3"],
),
(
"--max-date 1D",
(
("SEQ_1","2020-03-23"),
("SEQ_2","2020-03-24"),
("SEQ_3","2020-03-25"),
),
["SEQ_1", "SEQ_2"],
),
# 4W -> 28 days back -> cutoff 2020-02-26
(
"--min-date 4W",
(
("SEQ_1","2020-02-25"),
("SEQ_2","2020-02-26"),
("SEQ_3","2020-03-25"),
),
["SEQ_2", "SEQ_3"],
),
(
"--max-date 4W",
(
("SEQ_1","2020-02-25"),
("SEQ_2","2020-02-26"),
("SEQ_3","2020-03-25"),
),
["SEQ_1", "SEQ_2"],
),
# 1M -> one calendar month back -> cutoff 2020-02-25
(
"--min-date 1M",
(
("SEQ_1","2020-01-25"),
("SEQ_2","2020-02-25"),
("SEQ_3","2020-03-25"),
),
["SEQ_2", "SEQ_3"],
),
(
"--max-date 1M",
(
("SEQ_1","2020-01-25"),
("SEQ_2","2020-02-25"),
("SEQ_3","2020-03-25"),
),
["SEQ_1", "SEQ_2"],
),
# Same as 1M, but with the explicit ISO 8601 "P" prefix
(
"--min-date P1M",
(
("SEQ_1","2020-01-25"),
("SEQ_2","2020-02-25"),
("SEQ_3","2020-03-25"),
),
["SEQ_2", "SEQ_3"],
),
(
"--max-date P1M",
(
("SEQ_1","2020-01-25"),
("SEQ_2","2020-02-25"),
("SEQ_3","2020-03-25"),
),
["SEQ_1", "SEQ_2"],
),
# 2Y -> cutoff 2018-03-25
(
"--min-date 2Y",
(
("SEQ_1","2017-03-25"),
("SEQ_2","2018-03-25"),
("SEQ_3","2019-03-25"),
),
["SEQ_2", "SEQ_3"],
),
(
"--max-date 2Y",
(
("SEQ_1","2017-03-25"),
("SEQ_2","2018-03-25"),
("SEQ_3","2019-03-25"),
),
["SEQ_1", "SEQ_2"],
),
# Combined units: 1 year + 2 weeks + 5 days back -> cutoff 2019-03-06
(
"--min-date 1Y2W5D",
(
("SEQ_1","2019-03-05"),
("SEQ_2","2019-03-06"),
("SEQ_3","2019-03-07"),
),
["SEQ_2", "SEQ_3"],
),
(
"--max-date 1Y2W5D",
(
("SEQ_1","2019-03-05"),
("SEQ_2","2019-03-06"),
("SEQ_3","2019-03-07"),
),
["SEQ_1", "SEQ_2"],
),
],
)
def test_filter_relative_dates(self, tmpdir, argparser, argparse_params, metadata_rows, output_sorted_expected):
"""Test that various relative dates work.

Each case writes a three-strain metadata file, runs the filter with a
relative --min-date or --max-date, and checks which strains are written
to the --output-strains file. The strain dated exactly on the cutoff
is expected in the output for both flags.
"""
out_fn = str(tmpdir / "filtered.txt")
meta_fn = write_metadata(tmpdir, (("strain","date"),
*metadata_rows))
args = argparser(f'--metadata {meta_fn} --output-strains {out_fn} {argparse_params}')
augur.filter.run(args)
# Sort the output so the comparison does not depend on write order.
with open(out_fn) as f:
output_sorted = sorted(line.rstrip() for line in f)
assert output_sorted == output_sorted_expected

# "Today" is frozen at 2020-03-25 so the outcome does not depend on the run date.
@freeze_time("2020-03-25")
@pytest.mark.parametrize(
"argparse_flag, argparse_value",
[
# 3000 years before the frozen date precedes year 1, the earliest year
# datetime.date supports.
("--min-date", "3000Y"),
("--max-date", "3000Y"),
# Not a number, a YYYY-MM-DD date, or an ISO 8601 duration.
("--min-date", "invalid"),
("--max-date", "invalid"),
],
)
def test_filter_relative_dates_error(self, tmpdir, argparser, argparse_flag, argparse_value):
"""Test that invalid dates fail.

Argument parsing is expected to exit (SystemExit), and the underlying
error message is expected to list the supported date formats.
"""
out_fn = str(tmpdir / "filtered.txt")
meta_fn = write_metadata(tmpdir, (("strain","date"),
("SEQ_1","2020-03-23")))
with pytest.raises(SystemExit) as e_info:
argparser(f'--metadata {meta_fn} --output-strains {out_fn} {argparse_flag} {argparse_value}')
# NOTE(review): this assert only runs if it sits outside the `with` block
# above; the nesting is not visible in this view, so confirm.
# Presumably __context__ is the argparse error that triggered the exit, which
# carries the message raised by numeric_date; verify.
assert e_info.value.__context__.message == dedent(f"""\
Unable to determine date from '{argparse_value}'. Ensure it is in one of the supported formats:
1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or
2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or
3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')
""")