filter: Use intermediate columns for grouping

Previously: 1. year/month/day columns were created on the DataFrame used for grouping. 2. month was overwritten with (year, month). This allowed year to be used for grouping, but also the same for day which was an unintended side effect. Instead of adding columns with those names, use names with a more distinct prefix that can be safely discarded before grouping happens.
nextstrain · Oct 24, 2022 · ea4c7cd · ea4c7cd
1 parent 181f48a
commit ea4c7cd
Showing 1 changed file with 41 additions and 14 deletions.
diff --git a/augur/filter.py b/augur/filter.py
@@ -1030,52 +1030,78 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
             df_dates = pd.DataFrame({col: 'unknown' for col in GROUP_BY_GENERATED_COLUMNS}, index=metadata.index)
             metadata = pd.concat([metadata, df_dates], axis=1)
         else:
-            # Replace date with year/month/day as nullable ints.
-            date_cols = ['year', 'month', 'day']
+            # Create a DataFrame with year/month/day columns as nullable ints.
+            # These columns are prefixed to note temporary usage. They are used
+            # to generate other columns, and will be discarded at the end.
+            temp_prefix = '_augur_filter_'
+            temp_date_cols = [f'{temp_prefix}year', f'{temp_prefix}month', f'{temp_prefix}day']
             df_dates = metadata['date'].str.split('-', n=2, expand=True)
-            df_dates = df_dates.set_axis(date_cols[:len(df_dates.columns)], axis=1)
-            missing_date_cols = set(date_cols) - set(df_dates.columns)
+            df_dates = df_dates.set_axis(temp_date_cols[:len(df_dates.columns)], axis=1)
+            missing_date_cols = set(temp_date_cols) - set(df_dates.columns)
             for col in missing_date_cols:
                 df_dates[col] = pd.NA
-            for col in date_cols:
+            for col in temp_date_cols:
                 df_dates[col] = pd.to_numeric(df_dates[col], errors='coerce').astype(pd.Int64Dtype())
+
+            # Extend metadata with generated date columns
+            # Drop the 'date' column since it should not be used for grouping.
             metadata = pd.concat([metadata.drop('date', axis=1), df_dates], axis=1)
             if 'year' in generated_columns_requested:
                 # Skip ambiguous years.
-                df_skip = metadata[metadata['year'].isnull()]
-                metadata.dropna(subset=['year'], inplace=True)
+                df_skip = metadata[metadata[f'{temp_prefix}year'].isnull()]
+                metadata.dropna(subset=[f'{temp_prefix}year'], inplace=True)
                 for strain in df_skip.index:
                     skipped_strains.append({
                         "strain": strain,
                         "filter": "skip_group_by_with_ambiguous_year",
                         "kwargs": "",
                     })
+
+                # Make a generated 'year' column available for grouping.
+                metadata['year'] = metadata[f'{temp_prefix}year']
+
             if 'month' in generated_columns_requested:
                 # Skip ambiguous months.
-                df_skip = metadata[metadata['month'].isnull()]
-                metadata.dropna(subset=['month'], inplace=True)
+                df_skip = metadata[metadata[f'{temp_prefix}month'].isnull()]
+                metadata.dropna(subset=[f'{temp_prefix}month'], inplace=True)
                 for strain in df_skip.index:
                     skipped_strains.append({
                         "strain": strain,
                         "filter": "skip_group_by_with_ambiguous_month",
                         "kwargs": "",
                     })
-                # month = (year, month)
-                metadata['month'] = list(zip(metadata['year'], metadata['month']))
+
+                # Make a generated 'month' column available for grouping.
+                metadata['month'] = list(zip(
+                    metadata[f'{temp_prefix}year'],
+                    metadata[f'{temp_prefix}month']
+                ))
+
             if 'week' in generated_columns_requested:
                 # Skip ambiguous days.
-                df_skip = metadata[metadata['day'].isnull()]
-                metadata.dropna(subset=['day'], inplace=True)
+                df_skip = metadata[metadata[f'{temp_prefix}day'].isnull()]
+                metadata.dropna(subset=[f'{temp_prefix}day'], inplace=True)
                 for strain in df_skip.index:
                     skipped_strains.append({
                         "strain": strain,
                         "filter": "skip_group_by_with_ambiguous_day",
                         "kwargs": "",
                     })
+
+                # Make a generated 'week' column available for grouping.
                 # Note that week = (year, week) from the date.isocalendar().
                 # Do not combine the raw year with the ISO week number alone,
                 # since raw year ≠ ISO year.
-                metadata['week'] = metadata.apply(lambda row: get_iso_year_week(row['year'], row['month'], row['day']), axis=1)
+                metadata['week'] = metadata.apply(lambda row: get_iso_year_week(
+                    row[f'{temp_prefix}year'],
+                    row[f'{temp_prefix}month'],
+                    row[f'{temp_prefix}day']
+                    ), axis=1
+                )
+
+            # Drop the internally used columns.
+            for col in temp_date_cols:
+                metadata.drop(col, axis=1, inplace=True)
 
     unknown_groups = group_by_set - set(metadata.columns)
     if unknown_groups:
@@ -1084,6 +1110,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
         for group in unknown_groups:
             metadata[group] = 'unknown'
 
+    # Finally, determine groups.
     group_by_strain = dict(zip(metadata.index, metadata[group_by].apply(tuple, axis=1)))
     return group_by_strain, skipped_strains