Skip to content

Commit

Permalink
Merge pull request #623 from saikirankv/master
Browse files Browse the repository at this point in the history
Excluding ambiguous date in filter command
  • Loading branch information
huddlej authored Nov 12, 2020
2 parents b2ae15f + d7b7d2f commit 748abc8
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 3 deletions.
24 changes: 22 additions & 2 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import sys
import datetime
import treetime.utils
from .utils import read_metadata, get_numerical_dates, run_shell_command, shquote
from .utils import read_metadata, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous

comment_char = '#'

Expand Down Expand Up @@ -105,6 +105,8 @@ def register_arguments(parser):
help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
parser.add_argument('--include-where', nargs='+',
help="Include samples with these values. ex: host=rat. Multiple values are processed as OR (having any of those specified will be included), not AND. This rule is applied last and ensures any sequences matching these rules will be included.")
parser.add_argument('--exclude-ambiguous-dates-by', choices=['all', 'day', 'month', 'year'],
help='Exclude ambiguous dates Ex: days - excludes 2020-09-XX, months - excludes 2020-xx-19, all - excludes any ambiguous dates')
parser.add_argument('--query', help="Filter samples by attribute. Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.")
parser.add_argument('--output', '-o', help="output file", required=True)

Expand Down Expand Up @@ -231,10 +233,26 @@ def run(args):
num_excluded_by_length = len(seq_keep) - len(seq_keep_by_length)
seq_keep = seq_keep_by_length

# filter by ambiguous dates
num_excluded_by_ambiguous_date = 0
if args.exclude_ambiguous_dates_by and 'date' in meta_columns:
seq_keep_by_date = []
for seq_name in seq_keep:
if is_date_ambiguous(meta_dict[seq_name]['date'],args.exclude_ambiguous_dates_by) is False:
seq_keep_by_date.append(seq_name)
num_excluded_by_ambiguous_date = len(seq_keep) - len(seq_keep_by_date)
seq_keep = seq_keep_by_date

# filter by date
num_excluded_by_date = 0
if (args.min_date or args.max_date) and 'date' in meta_columns:
dates = get_numerical_dates(meta_dict, fmt="%Y-%m-%d")
if num_excluded_by_ambiguous_date:
date_meta_dict = {}
for seq_name in seq_keep:
date_meta_dict[seq_name]=meta_dict[seq_name]
dates = get_numerical_dates(date_meta_dict, fmt="%Y-%m-%d")
else:
dates = get_numerical_dates(meta_dict, fmt="%Y-%m-%d")
tmp = [s for s in seq_keep if dates[s] is not None]
if args.min_date:
tmp = [s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>args.min_date]
Expand Down Expand Up @@ -423,6 +441,8 @@ def run(args):
print("\t%i of these were filtered out by the query:\n\t\t\"%s\"" % (num_excluded_by_query, args.query))
if args.min_length:
print("\t%i of these were dropped because they were shorter than minimum length of %sbp" % (num_excluded_by_length, args.min_length))
if args.exclude_ambiguous_dates_by and num_excluded_by_ambiguous_date:
print("\t%i of these were dropped because of their ambiguous date in %s" % (num_excluded_by_ambiguous_date, args.exclude_ambiguous_dates_by))
if (args.min_date or args.max_date) and 'date' in meta_columns:
print("\t%i of these were dropped because of their date (or lack of date)" % (num_excluded_by_date))
if args.non_nucleotide:
Expand Down
23 changes: 22 additions & 1 deletion augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,27 @@ def ambiguous_date_to_date_range(uncertain_date, fmt, min_max_year=None):
def read_metadata(fname, query=None):
return MetadataFile(fname, query).read()

def is_date_ambiguous(date, ambiguous_by="all"):
    """
    Returns whether a given date string in the format of YYYY-MM-DD is ambiguous
    by a given part of the date (e.g., day, month, year, or all parts).

    A part is ambiguous when it contains the placeholder character "X". Parts
    that are missing altogether (e.g., "2019" or "2019-03") or empty are treated
    as ambiguous, so reduced-precision dates no longer raise a ValueError when
    the string is unpacked.

    Parameters
    ----------
    date : str
        Date string in the format of YYYY-MM-DD
    ambiguous_by : str
        Field of the date string to test for ambiguity ("day", "month", "year", "all")

    Returns
    -------
    bool
        True if the requested field is ambiguous, False otherwise (including
        when `ambiguous_by` is not one of the recognised fields).
    """
    # Split at most twice so any trailing text stays attached to the day field
    # instead of breaking the three-way unpacking below.
    components = date.split('-', 2)
    # Pad absent fields, then mark absent/empty fields as fully ambiguous.
    components += [''] * (3 - len(components))
    year, month, day = (component or 'XX' for component in components)

    ambiguity_by_field = {
        'year': 'X' in year,
        'month': 'X' in month,
        'day': 'X' in day,
    }
    if ambiguous_by == 'all':
        return any(ambiguity_by_field.values())
    return ambiguity_by_field.get(ambiguous_by, False)

def get_numerical_dates(meta_dict, name_col = None, date_col='date', fmt=None, min_max_year=None):
num_excluded_recs = 0
if fmt:
from datetime import datetime
numerical_dates = {}
Expand All @@ -95,7 +115,8 @@ def get_numerical_dates(meta_dict, name_col = None, date_col='date', fmt=None, m
numerical_dates[k] = None
else:
numerical_dates = {k:float(v) for k,v in meta_dict.items()}

if num_excluded_recs:
print("%s records were excluded due to ambiguous date"%num_excluded_recs)
return numerical_dates


Expand Down
14 changes: 14 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,17 @@ def test_read_mask_file_drm_file(self, tmpdir):
with open(drm_file, "w") as fh:
fh.write("\n".join(drm_lines))
assert utils.read_mask_file(drm_file) == expected_sites

def test_is_date_ambiguous(self):
    """is_date_ambiguous should be truthy when the requested field contains an X."""
    # Each pair is (date string, field checked for ambiguity).
    ambiguous_cases = (
        ("2019-0X-0X", "all"),
        ("2019-XX-09", "month"),
        ("2019-03-XX", "day"),
        ("201X-03-09", "year"),
    )
    for date_string, field in ambiguous_cases:
        assert utils.is_date_ambiguous(date_string, field)

def test_not_is_date_ambiguous(self):
    """is_date_ambiguous should be exactly False when the requested field has no X."""
    # Ambiguity in a field other than the one requested must not be reported.
    unambiguous_cases = (
        ("2019-09-03", "all"),
        ("2019-03-XX", "month"),
        ("2019-XX-01", "day"),
        ("2019-XX-XX", "year"),
    )
    for date_string, field in unambiguous_cases:
        assert utils.is_date_ambiguous(date_string, field) is False

0 comments on commit 748abc8

Please sign in to comment.