From ea4c7cdd6ec2469a2b703a568f71ebca2968b169 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Mon, 24 Oct 2022 14:18:23 -0700
Subject: [PATCH] filter: Use intermediate columns for grouping

Previously:

1. year/month/day columns were created on the DataFrame used for
   grouping.
2. month was overwritten with (year, month).

This allowed year to be used for grouping, but it also did the same for
day, which was an unintended side effect.

Instead of adding columns with those names, use names with a more
distinct prefix that can be safely discarded before grouping happens.
---
 augur/filter.py | 55 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/augur/filter.py b/augur/filter.py
index 7cc278221..de40ea8cc 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -1030,52 +1030,78 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
         df_dates = pd.DataFrame({col: 'unknown' for col in GROUP_BY_GENERATED_COLUMNS}, index=metadata.index)
         metadata = pd.concat([metadata, df_dates], axis=1)
     else:
-        # Replace date with year/month/day as nullable ints.
-        date_cols = ['year', 'month', 'day']
+        # Create a DataFrame with year/month/day columns as nullable ints.
+        # These columns are prefixed to note temporary usage. They are used
+        # to generate other columns, and will be discarded at the end.
+        temp_prefix = '_augur_filter_'
+        temp_date_cols = [f'{temp_prefix}year', f'{temp_prefix}month', f'{temp_prefix}day']
         df_dates = metadata['date'].str.split('-', n=2, expand=True)
-        df_dates = df_dates.set_axis(date_cols[:len(df_dates.columns)], axis=1)
-        missing_date_cols = set(date_cols) - set(df_dates.columns)
+        df_dates = df_dates.set_axis(temp_date_cols[:len(df_dates.columns)], axis=1)
+        missing_date_cols = set(temp_date_cols) - set(df_dates.columns)
         for col in missing_date_cols:
             df_dates[col] = pd.NA
-        for col in date_cols:
+        for col in temp_date_cols:
             df_dates[col] = pd.to_numeric(df_dates[col], errors='coerce').astype(pd.Int64Dtype())
+
+        # Extend metadata with generated date columns.
+        # Drop the 'date' column since it should not be used for grouping.
         metadata = pd.concat([metadata.drop('date', axis=1), df_dates], axis=1)
         if 'year' in generated_columns_requested:
             # Skip ambiguous years.
-            df_skip = metadata[metadata['year'].isnull()]
-            metadata.dropna(subset=['year'], inplace=True)
+            df_skip = metadata[metadata[f'{temp_prefix}year'].isnull()]
+            metadata.dropna(subset=[f'{temp_prefix}year'], inplace=True)
             for strain in df_skip.index:
                 skipped_strains.append({
                     "strain": strain,
                     "filter": "skip_group_by_with_ambiguous_year",
                     "kwargs": "",
                 })
+
+            # Make a generated 'year' column available for grouping.
+            metadata['year'] = metadata[f'{temp_prefix}year']
+
         if 'month' in generated_columns_requested:
             # Skip ambiguous months.
-            df_skip = metadata[metadata['month'].isnull()]
-            metadata.dropna(subset=['month'], inplace=True)
+            df_skip = metadata[metadata[f'{temp_prefix}month'].isnull()]
+            metadata.dropna(subset=[f'{temp_prefix}month'], inplace=True)
             for strain in df_skip.index:
                 skipped_strains.append({
                     "strain": strain,
                     "filter": "skip_group_by_with_ambiguous_month",
                     "kwargs": "",
                 })
-            # month = (year, month)
-            metadata['month'] = list(zip(metadata['year'], metadata['month']))
+
+            # Make a generated 'month' column available for grouping.
+            metadata['month'] = list(zip(
+                metadata[f'{temp_prefix}year'],
+                metadata[f'{temp_prefix}month']
+            ))
+
         if 'week' in generated_columns_requested:
             # Skip ambiguous days.
-            df_skip = metadata[metadata['day'].isnull()]
-            metadata.dropna(subset=['day'], inplace=True)
+            df_skip = metadata[metadata[f'{temp_prefix}day'].isnull()]
+            metadata.dropna(subset=[f'{temp_prefix}day'], inplace=True)
             for strain in df_skip.index:
                 skipped_strains.append({
                     "strain": strain,
                     "filter": "skip_group_by_with_ambiguous_day",
                     "kwargs": "",
                 })
+
+            # Make a generated 'week' column available for grouping.
             # Note that week = (year, week) from the date.isocalendar().
             # Do not combine the raw year with the ISO week number alone,
             # since raw year ≠ ISO year.
-            metadata['week'] = metadata.apply(lambda row: get_iso_year_week(row['year'], row['month'], row['day']), axis=1)
+            metadata['week'] = metadata.apply(lambda row: get_iso_year_week(
+                    row[f'{temp_prefix}year'],
+                    row[f'{temp_prefix}month'],
+                    row[f'{temp_prefix}day']
+                ), axis=1
+            )
+
+        # Drop the internally used columns.
+        for col in temp_date_cols:
+            metadata.drop(col, axis=1, inplace=True)
 
     unknown_groups = group_by_set - set(metadata.columns)
     if unknown_groups:
@@ -1084,6 +1110,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
         for group in unknown_groups:
             metadata[group] = 'unknown'
 
+    # Finally, determine groups.
     group_by_strain = dict(zip(metadata.index, metadata[group_by].apply(tuple, axis=1)))
 
     return group_by_strain, skipped_strains
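
Below is a minimal, standalone sketch (not part of the patch) of the intermediate-column approach described in the commit message: temporary, prefixed columns are derived from 'date', used to build the grouping columns, and then dropped so they can never be picked up as group-by keys themselves. The toy metadata and the '_example_' prefix are illustrative assumptions, not the actual augur implementation.

import pandas as pd

# Toy metadata (hypothetical strains and dates, for illustration only).
metadata = pd.DataFrame(
    {"date": ["2022-10-24", "2021-03", "2020"]},
    index=["strain_a", "strain_b", "strain_c"],
)

# Hypothetical prefix; the patch itself uses '_augur_filter_'.
temp_prefix = "_example_"
temp_date_cols = [f"{temp_prefix}{part}" for part in ("year", "month", "day")]

# Split 'date' into temporary year/month/day columns as nullable ints.
df_dates = metadata["date"].str.split("-", n=2, expand=True)
df_dates = df_dates.set_axis(temp_date_cols[:len(df_dates.columns)], axis=1)
for col in temp_date_cols:
    if col not in df_dates.columns:
        df_dates[col] = pd.NA
    df_dates[col] = pd.to_numeric(df_dates[col], errors="coerce").astype("Int64")

metadata = pd.concat([metadata.drop("date", axis=1), df_dates], axis=1)

# Grouping columns are generated from the temporary columns.
metadata["year"] = metadata[f"{temp_prefix}year"]
metadata["month"] = list(zip(metadata[f"{temp_prefix}year"],
                             metadata[f"{temp_prefix}month"]))

# The temporary columns are dropped before any grouping happens, so a
# request to group by 'day' no longer resolves to a generated column.
metadata = metadata.drop(columns=temp_date_cols)
print(metadata)

After the drop, a request to group by 'day' falls through to the normal unknown-column handling instead of silently using an internally generated column, which appears to be the behavior the patch is after.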