Skip to content

Commit

Permalink
Merge pull request #1072: Use stacked ambiguous date checking
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin authored Oct 26, 2022
2 parents e71360b + 0a8760c commit 2891ef0
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 36 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
### Bug Fixes

* filter: Fixed unintended behavior in which grouping by `day` would "work" when used with `month` and/or `year`. Updated so it will be ignored. [#1070][] (@victorlin)
* filter: Fixed unintended behavior in which grouping by `month` with ambiguous years would "work". Updated so date ambiguity is checked properly for all generated columns. [#1072][] (@victorlin)

[#1067]: https://github.com/nextstrain/augur/pull/1067
[#1070]: https://github.com/nextstrain/augur/pull/1070
[#1072]: https://github.com/nextstrain/augur/pull/1072

## 18.0.0 (21 September 2022)

Expand Down
93 changes: 58 additions & 35 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -1049,49 +1049,24 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
# Extend metadata with generated date columns
# Drop the 'date' column since it should not be used for grouping.
metadata = pd.concat([metadata.drop('date', axis=1), df_dates], axis=1)

ambiguous_date_strains = list(_get_ambiguous_date_skipped_strains(
metadata,
temp_prefix,
generated_columns_requested
))
metadata.drop([record['strain'] for record in ambiguous_date_strains], inplace=True)
skipped_strains.extend(ambiguous_date_strains)

# Generate columns.
if 'year' in generated_columns_requested:
# Skip ambiguous years.
df_skip = metadata[metadata[f'{temp_prefix}year'].isnull()]
metadata.dropna(subset=[f'{temp_prefix}year'], inplace=True)
for strain in df_skip.index:
skipped_strains.append({
"strain": strain,
"filter": "skip_group_by_with_ambiguous_year",
"kwargs": "",
})

# Make a generated 'year' column available for grouping.
metadata['year'] = metadata[f'{temp_prefix}year']

if 'month' in generated_columns_requested:
# Skip ambiguous months.
df_skip = metadata[metadata[f'{temp_prefix}month'].isnull()]
metadata.dropna(subset=[f'{temp_prefix}month'], inplace=True)
for strain in df_skip.index:
skipped_strains.append({
"strain": strain,
"filter": "skip_group_by_with_ambiguous_month",
"kwargs": "",
})

# Make a generated 'month' column available for grouping.
metadata['month'] = list(zip(
metadata[f'{temp_prefix}year'],
metadata[f'{temp_prefix}month']
))

if 'week' in generated_columns_requested:
# Skip ambiguous days.
df_skip = metadata[metadata[f'{temp_prefix}day'].isnull()]
metadata.dropna(subset=[f'{temp_prefix}day'], inplace=True)
for strain in df_skip.index:
skipped_strains.append({
"strain": strain,
"filter": "skip_group_by_with_ambiguous_day",
"kwargs": "",
})

# Make a generated 'week' column available for grouping.
# Note that week = (year, week) from the date.isocalendar().
# Do not combine the raw year with the ISO week number alone,
# since raw year ≠ ISO year.
Expand All @@ -1118,6 +1093,54 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
return group_by_strain, skipped_strains


def _get_ambiguous_date_skipped_strains(
metadata, temp_prefix, generated_columns_requested):
"""Get strains skipped due to date ambiguity.
Each value is a dictionary with keys:
- `strain`: strain name
- `filter`: filter reason. Used for the final report output.
- `kwargs`: Empty string since filter reason does not represent a function.
"""
# Don't yield the same strain twice.
already_skipped_strains = set()

if generated_columns_requested:
# Skip ambiguous years.
df_skip = metadata[metadata[f'{temp_prefix}year'].isnull()]
for strain in df_skip.index:
if strain not in already_skipped_strains:
yield {
"strain": strain,
"filter": "skip_group_by_with_ambiguous_year",
"kwargs": "",
}
already_skipped_strains.update(df_skip.index)
if 'month' in generated_columns_requested or 'week' in generated_columns_requested:
# Skip ambiguous months.
df_skip = metadata[metadata[f'{temp_prefix}month'].isnull()]
for strain in df_skip.index:
if strain not in already_skipped_strains:
yield {
"strain": strain,
"filter": "skip_group_by_with_ambiguous_month",
"kwargs": "",
}
already_skipped_strains.update(df_skip.index)
if 'week' in generated_columns_requested:
# Skip ambiguous days.
df_skip = metadata[metadata[f'{temp_prefix}day'].isnull()]
for strain in df_skip.index:
if strain not in already_skipped_strains:
yield {
"strain": strain,
"filter": "skip_group_by_with_ambiguous_day",
"kwargs": "",
}
# TODO: uncomment if another filter reason is ever added.
# already_skipped_strains.update(df_skip.index)


class PriorityQueue:
"""A priority queue implementation that automatically replaces lower priority
items in the heap with incoming higher priority items.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,9 @@ Group by 'week'. Check the number of strains that have been dropped due to ambig
> --subsample-seed 0 \
> --output-strains "$TMP/filtered_strains.txt" \
> --output-log "$TMP/filtered_log.tsv" > /dev/null
$ grep "skip_group_by_with_ambiguous_year" "$TMP/filtered_log.tsv" | wc -l
\s*1 (re)
$ grep "skip_group_by_with_ambiguous_month" "$TMP/filtered_log.tsv" | wc -l
\s*1 (re)
$ grep "skip_group_by_with_ambiguous_day" "$TMP/filtered_log.tsv" | wc -l
\s*5 (re)
\s*3 (re)

0 comments on commit 2891ef0

Please sign in to comment.