Skip to content

Commit

Permalink
Use stacked ambiguous date checking
Browse files Browse the repository at this point in the history
Previously, each ambiguous date check ran only when its corresponding
generated date column was requested (a 1:1 mapping). This is incorrect:
grouping by month must also check for an ambiguous year, and grouping by
week must check for an ambiguous year, month, or day.

Separate the ambiguous date checking from the column generation, and
make the check conditions "stacked": the year is always checked, the
month is checked when grouping by month or week, and the day is checked
only when grouping by week.
  • Loading branch information
victorlin committed Oct 25, 2022
1 parent 61aeda6 commit fd88c61
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 16 deletions.
29 changes: 14 additions & 15 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -1049,8 +1049,10 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
# Extend metadata with generated date columns
# Drop the 'date' column since it should not be used for grouping.
metadata = pd.concat([metadata.drop('date', axis=1), df_dates], axis=1)
if 'year' in generated_columns_requested:
# Skip ambiguous years.

# Skip ambiguous dates.
if True:
# Skip ambiguous years (always, since generated columns are requested).
df_skip = metadata[metadata[f'{temp_prefix}year'].isnull()]
metadata.drop(df_skip.index, inplace=True)
for strain in df_skip.index:
Expand All @@ -1059,11 +1061,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
"filter": "skip_group_by_with_ambiguous_year",
"kwargs": "",
})

# Make a generated 'year' column available for grouping.
metadata['year'] = metadata[f'{temp_prefix}year']

if 'month' in generated_columns_requested:
if 'month' in generated_columns_requested or 'week' in generated_columns_requested:
# Skip ambiguous months.
df_skip = metadata[metadata[f'{temp_prefix}month'].isnull()]
metadata.drop(df_skip.index, inplace=True)
Expand All @@ -1073,13 +1071,6 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
"filter": "skip_group_by_with_ambiguous_month",
"kwargs": "",
})

# Make a generated 'month' column available for grouping.
metadata['month'] = list(zip(
metadata[f'{temp_prefix}year'],
metadata[f'{temp_prefix}month']
))

if 'week' in generated_columns_requested:
# Skip ambiguous days.
df_skip = metadata[metadata[f'{temp_prefix}day'].isnull()]
Expand All @@ -1091,7 +1082,15 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
"kwargs": "",
})

# Make a generated 'week' column available for grouping.
# Generate columns.
if 'year' in generated_columns_requested:
metadata['year'] = metadata[f'{temp_prefix}year']
if 'month' in generated_columns_requested:
metadata['month'] = list(zip(
metadata[f'{temp_prefix}year'],
metadata[f'{temp_prefix}month']
))
if 'week' in generated_columns_requested:
# Note that week = (year, week) from the date.isocalendar().
# Do not combine the raw year with the ISO week number alone,
# since raw year ≠ ISO year.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,9 @@ Group by 'week'. Check the number of strains that have been dropped due to ambig
> --subsample-seed 0 \
> --output-strains "$TMP/filtered_strains.txt" \
> --output-log "$TMP/filtered_log.tsv" > /dev/null
$ grep "skip_group_by_with_ambiguous_year" "$TMP/filtered_log.tsv" | wc -l
\s*1 (re)
$ grep "skip_group_by_with_ambiguous_month" "$TMP/filtered_log.tsv" | wc -l
\s*1 (re)
$ grep "skip_group_by_with_ambiguous_day" "$TMP/filtered_log.tsv" | wc -l
\s*5 (re)
\s*3 (re)

0 comments on commit fd88c61

Please sign in to comment.