Merge pull request #1072: Use stacked ambiguous date checking

nextstrain · Oct 26, 2022 · 2891ef0 · 2891ef0
2 parents e71360b + 0a8760c
commit 2891ef0
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 36 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -9,9 +9,11 @@
 ### Bug Fixes
 
 * filter: Fixed unintended behavior in which grouping by `day` would "work" when used with `month` and/or `year`. Updated so it will be ignored. [#1070][] (@victorlin)
+* filter: Fixed unintended behavior in which grouping by `month` with ambiguous years would "work". Updated so date ambiguity is checked properly for all generated columns. [#1072][] (@victorlin)
 
 [#1067]: https://github.com/nextstrain/augur/pull/1067
 [#1070]: https://github.com/nextstrain/augur/pull/1070
+[#1072]: https://github.com/nextstrain/augur/pull/1072
 
 ## 18.0.0 (21 September 2022)
 

diff --git a/augur/filter.py b/augur/filter.py
@@ -1049,49 +1049,24 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
             # Extend metadata with generated date columns
             # Drop the 'date' column since it should not be used for grouping.
             metadata = pd.concat([metadata.drop('date', axis=1), df_dates], axis=1)
+
+            ambiguous_date_strains = list(_get_ambiguous_date_skipped_strains(
+                metadata,
+                temp_prefix,
+                generated_columns_requested
+            ))
+            metadata.drop([record['strain'] for record in ambiguous_date_strains], inplace=True)
+            skipped_strains.extend(ambiguous_date_strains)
+
+            # Generate columns.
             if 'year' in generated_columns_requested:
-                # Skip ambiguous years.
-                df_skip = metadata[metadata[f'{temp_prefix}year'].isnull()]
-                metadata.dropna(subset=[f'{temp_prefix}year'], inplace=True)
-                for strain in df_skip.index:
-                    skipped_strains.append({
-                        "strain": strain,
-                        "filter": "skip_group_by_with_ambiguous_year",
-                        "kwargs": "",
-                    })
-
-                # Make a generated 'year' column available for grouping.
                 metadata['year'] = metadata[f'{temp_prefix}year']
-
             if 'month' in generated_columns_requested:
-                # Skip ambiguous months.
-                df_skip = metadata[metadata[f'{temp_prefix}month'].isnull()]
-                metadata.dropna(subset=[f'{temp_prefix}month'], inplace=True)
-                for strain in df_skip.index:
-                    skipped_strains.append({
-                        "strain": strain,
-                        "filter": "skip_group_by_with_ambiguous_month",
-                        "kwargs": "",
-                    })
-
-                # Make a generated 'month' column available for grouping.
                 metadata['month'] = list(zip(
                     metadata[f'{temp_prefix}year'],
                     metadata[f'{temp_prefix}month']
                 ))
-
             if 'week' in generated_columns_requested:
-                # Skip ambiguous days.
-                df_skip = metadata[metadata[f'{temp_prefix}day'].isnull()]
-                metadata.dropna(subset=[f'{temp_prefix}day'], inplace=True)
-                for strain in df_skip.index:
-                    skipped_strains.append({
-                        "strain": strain,
-                        "filter": "skip_group_by_with_ambiguous_day",
-                        "kwargs": "",
-                    })
-
-                # Make a generated 'week' column available for grouping.
                 # Note that week = (year, week) from the date.isocalendar().
                 # Do not combine the raw year with the ISO week number alone,
                 # since raw year ≠ ISO year.
@@ -1118,6 +1093,54 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
     return group_by_strain, skipped_strains
 
 
+def _get_ambiguous_date_skipped_strains(
+        metadata, temp_prefix, generated_columns_requested):
+    """Get strains skipped due to date ambiguity.
+
+    Each value is a dictionary with keys:
+    - `strain`: strain name
+    - `filter`: filter reason. Used for the final report output.
+    - `kwargs`: Empty string since filter reason does not represent a function.
+    """
+    # Don't yield the same strain twice.
+    already_skipped_strains = set()
+
+    if generated_columns_requested:
+        # Skip ambiguous years.
+        df_skip = metadata[metadata[f'{temp_prefix}year'].isnull()]
+        for strain in df_skip.index:
+            if strain not in already_skipped_strains:
+                yield {
+                    "strain": strain,
+                    "filter": "skip_group_by_with_ambiguous_year",
+                    "kwargs": "",
+                }
+        already_skipped_strains.update(df_skip.index)
+    if 'month' in generated_columns_requested or 'week' in generated_columns_requested:
+        # Skip ambiguous months.
+        df_skip = metadata[metadata[f'{temp_prefix}month'].isnull()]
+        for strain in df_skip.index:
+            if strain not in already_skipped_strains:
+                yield {
+                    "strain": strain,
+                    "filter": "skip_group_by_with_ambiguous_month",
+                    "kwargs": "",
+                }
+        already_skipped_strains.update(df_skip.index)
+    if 'week' in generated_columns_requested:
+        # Skip ambiguous days.
+        df_skip = metadata[metadata[f'{temp_prefix}day'].isnull()]
+        for strain in df_skip.index:
+            if strain not in already_skipped_strains:
+                yield {
+                    "strain": strain,
+                    "filter": "skip_group_by_with_ambiguous_day",
+                    "kwargs": "",
+                }
+        # TODO: uncomment if another filter reason is ever added.
+        # already_skipped_strains.update(df_skip.index)
+
+
 class PriorityQueue:
     """A priority queue implementation that automatically replaces lower priority
     items in the heap with incoming higher priority items.

diff --git a/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t b/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t
@@ -38,5 +38,9 @@ Group by 'week'. Check the number of strains that have been dropped due to ambig
   >  --subsample-seed 0 \
   >  --output-strains "$TMP/filtered_strains.txt" \
   >  --output-log "$TMP/filtered_log.tsv" > /dev/null
+  $ grep "skip_group_by_with_ambiguous_year" "$TMP/filtered_log.tsv" | wc -l
+  \s*1 (re)
+  $ grep "skip_group_by_with_ambiguous_month" "$TMP/filtered_log.tsv" | wc -l
+  \s*1 (re)
   $ grep "skip_group_by_with_ambiguous_day" "$TMP/filtered_log.tsv" | wc -l
-  \s*5 (re)
+  \s*3 (re)