From fd88c6106a0c01a54e8f2b264bdbde6487a97677 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 24 Oct 2022 15:07:26 -0700 Subject: [PATCH] Use stacked ambiguous date checking Previously, the ambiguous date checks were 1:1 with the generated date columns. This should not be the case since month needs to check for ambiguous year, and week needs to check for anything that is ambiguous. Separate the ambiguous date checking from the column generation, and update the conditions for the former to be "stacking" (i.e. year is always checked, month is checked for month/week, and day is checked for week only). --- augur/filter.py | 29 +++++++++---------- .../cram/subsample-skip-ambiguous-dates.t | 6 +++- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index 6f03746b3..cef6f6852 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -1049,8 +1049,10 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): # Extend metadata with generated date columns # Drop the 'date' column since it should not be used for grouping. metadata = pd.concat([metadata.drop('date', axis=1), df_dates], axis=1) - if 'year' in generated_columns_requested: - # Skip ambiguous years. + + # Skip ambiguous dates. + if True: + # Skip ambiguous years (always, since generated columns are requested). df_skip = metadata[metadata[f'{temp_prefix}year'].isnull()] metadata.drop(df_skip.index, inplace=True) for strain in df_skip.index: @@ -1059,11 +1061,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): "filter": "skip_group_by_with_ambiguous_year", "kwargs": "", }) - - # Make a generated 'year' column available for grouping. - metadata['year'] = metadata[f'{temp_prefix}year'] - - if 'month' in generated_columns_requested: + if 'month' in generated_columns_requested or 'week' in generated_columns_requested: # Skip ambiguous months. df_skip = metadata[metadata[f'{temp_prefix}month'].isnull()] metadata.drop(df_skip.index, inplace=True) @@ -1073,13 +1071,6 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): "filter": "skip_group_by_with_ambiguous_month", "kwargs": "", }) - - # Make a generated 'month' column available for grouping. - metadata['month'] = list(zip( - metadata[f'{temp_prefix}year'], - metadata[f'{temp_prefix}month'] - )) - if 'week' in generated_columns_requested: # Skip ambiguous days. df_skip = metadata[metadata[f'{temp_prefix}day'].isnull()] @@ -1091,7 +1082,15 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): "kwargs": "", }) - # Make a generated 'week' column available for grouping. + # Generate columns. + if 'year' in generated_columns_requested: + metadata['year'] = metadata[f'{temp_prefix}year'] + if 'month' in generated_columns_requested: + metadata['month'] = list(zip( + metadata[f'{temp_prefix}year'], + metadata[f'{temp_prefix}month'] + )) + if 'week' in generated_columns_requested: # Note that week = (year, week) from the date.isocalendar(). # Do not combine the raw year with the ISO week number alone, # since raw year ≠ ISO year. diff --git a/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t b/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t index 139796ae7..67dbf3d3f 100644 --- a/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t +++ b/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t @@ -38,5 +38,9 @@ Group by 'week'. Check the number of strains that have been dropped due to ambig > --subsample-seed 0 \ > --output-strains "$TMP/filtered_strains.txt" \ > --output-log "$TMP/filtered_log.tsv" > /dev/null + $ grep "skip_group_by_with_ambiguous_year" "$TMP/filtered_log.tsv" | wc -l + \s*1 (re) + $ grep "skip_group_by_with_ambiguous_month" "$TMP/filtered_log.tsv" | wc -l + \s*1 (re) $ grep "skip_group_by_with_ambiguous_day" "$TMP/filtered_log.tsv" | wc -l - \s*5 (re) + \s*3 (re)