Skip to content

Commit

Permalink
filter: Add --query-columns option
Browse files Browse the repository at this point in the history
This serves as an "escape hatch" for when automatic type inference does
not work as expected for whatever reason.
  • Loading branch information
victorlin committed Feb 8, 2024
1 parent f2b807b commit b0a0d11
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 11 deletions.
6 changes: 6 additions & 0 deletions augur/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Filter and subsample a sequence set.
"""
from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
from augur.filter.io import ACCEPTED_TYPES, column_type_pair
from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN
from augur.types import EmptyOutputReportingMethod
from . import constants
Expand All @@ -28,6 +29,11 @@ def register_arguments(parser):
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
)
metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", help=f"""
Use alongside --query to specify columns and data types in the format 'column:type', where type is one of ({','.join(ACCEPTED_TYPES)}).
Automatic type inference will be attempted on all unspecified columns used in the query.
Example: region:str coverage:float.
""")
metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
Expand Down
46 changes: 35 additions & 11 deletions augur/filter/include_exclude_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
import numpy as np
import pandas as pd
from typing import Any, Callable, Dict, List, Set, Tuple
from typing import Any, Callable, Dict, List, Optional, Set, Tuple

from augur.dates import is_date_ambiguous, get_numerical_dates
from augur.errors import AugurError
Expand Down Expand Up @@ -165,7 +165,7 @@ def filter_by_exclude_where(metadata, exclude_where) -> FilterFunctionReturn:
return filtered


def filter_by_query(metadata: pd.DataFrame, query: str) -> FilterFunctionReturn:
def filter_by_query(metadata: pd.DataFrame, query: str, column_types: Optional[Dict[str, str]] = None) -> FilterFunctionReturn:
"""Filter metadata in the given pandas DataFrame with a query string and return
the strain names that pass the filter.
Expand All @@ -175,6 +175,8 @@ def filter_by_query(metadata: pd.DataFrame, query: str) -> FilterFunctionReturn:
Metadata indexed by strain name
query : str
Query string for the dataframe.
column_types : dict, optional
Dict mapping column names to data types (e.g. {'coverage': 'float'}).
Examples
--------
Expand All @@ -188,6 +190,9 @@ def filter_by_query(metadata: pd.DataFrame, query: str) -> FilterFunctionReturn:
# Create a copy to prevent modification of the original DataFrame.
metadata_copy = metadata.copy()

if column_types is None:
column_types = {}

# Set columns for type conversion.
variables = extract_variables(query)
if variables is not None:
Expand All @@ -196,16 +201,31 @@ def filter_by_query(metadata: pd.DataFrame, query: str) -> FilterFunctionReturn:
# Column extraction failed. Apply type conversion to all columns.
columns = metadata_copy.columns

# Support numeric comparisons in query strings.
#
# The built-in data type inference when loading the DataFrame does not
# If a type is not explicitly provided, try converting the column to numeric.
# This should cover most use cases, since one common problem is that the
# built-in data type inference when loading the DataFrame does not
# support nullable numeric columns, so numeric comparisons won't work on
# those columns. pd.to_numeric does proper conversion on those columns, and
# will not make any changes to columns with other values.
#
# TODO: Try boolean conversion?
# those columns. pd.to_numeric does proper conversion on those columns,
# and will not make any changes to columns with other values.
for column in columns:
metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='ignore')
column_types.setdefault(column, 'numeric')

# Convert data types before applying the query.
for column, dtype in column_types.items():
if dtype == 'numeric':
metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='ignore')
elif dtype == 'int':
try:
metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise', downcast='integer')
except ValueError as e:
raise AugurError(f"Failed to convert value in column {column!r} to int. {e}")
elif dtype == 'float':
try:
metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise', downcast='float')
except ValueError as e:
raise AugurError(f"Failed to convert value in column {column!r} to float. {e}")
elif dtype == 'str':
metadata_copy[column] = metadata_copy[column].astype('str', errors='ignore')

try:
return set(metadata_copy.query(query).index.values)
Expand Down Expand Up @@ -581,9 +601,13 @@ def construct_filters(args, sequence_index) -> Tuple[List[FilterOption], List[Fi

# Exclude strains by metadata, using pandas querying.
if args.query:
kwargs = {"query": args.query}
if args.query_columns:
kwargs["column_types"] = {column: dtype for column, dtype in args.query_columns}

exclude_by.append((
filter_by_query,
{"query": args.query}
kwargs
))

# Filter by ambiguous dates.
Expand Down
22 changes: 22 additions & 0 deletions augur/filter/io.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import argparse
import csv
import os
import re
from typing import Sequence, Set
import numpy as np
from collections import defaultdict
Expand Down Expand Up @@ -65,6 +67,26 @@ def write_metadata_based_outputs(input_metadata_path: str, delimiters: Sequence[
output_strains.close()


# Data types accepted in the 'column:type' pairs parsed by column_type_pair().
ACCEPTED_TYPES = {'int', 'float', 'str'}

def column_type_pair(input: str):
    """Get a 2-tuple for column name to type.

    Intended to be used as the argument type converter for argparse options that
    take type maps in a 'column:type' format.

    Parameters
    ----------
    input : str
        Value in the format 'column:type', where type is one of ACCEPTED_TYPES.
        The type is matched at the end of the string, so the column name itself
        may contain colons (e.g. 'a:b:int' parses as ('a:b', 'int')).

    Returns
    -------
    tuple of (str, str)
        The (column, type) pair.

    Raises
    ------
    argparse.ArgumentTypeError
        If the value is not in the 'column:type' format.
    """
    # Sort the accepted types so the regex alternation and, more importantly,
    # the user-facing error message are deterministic — iterating a set
    # directly yields an arbitrary, hash-seed-dependent order.
    accepted = sorted(ACCEPTED_TYPES)

    # Non-greedy column capture anchored at end-of-string: the type must be
    # the final colon-delimited segment.
    match = re.match(f"^(.+?):({'|'.join(accepted)})$", input)
    if not match:
        raise argparse.ArgumentTypeError(f"Column data types must be in the format 'column:type', where type is one of ({','.join(accepted)}).")

    column = match[1]
    dtype = match[2]

    return (column, dtype)


def cleanup_outputs(args):
"""Remove output files. Useful when terminating midway through a loop of metadata chunks."""
if args.output:
Expand Down
55 changes: 55 additions & 0 deletions tests/functional/filter/cram/filter-query-columns.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
Setup

$ source "$TESTDIR"/_setup.sh

Create metadata file for testing.

$ cat >metadata.tsv <<~~
> strain coverage category
> SEQ_1 0.94 A
> SEQ_2 0.95 B
> SEQ_3 0.96 C
> SEQ_4
> ~~

Automatic inference works.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --output-strains filtered_strains.txt
3 strains were dropped during filtering
3 were filtered out by the query: "coverage >= 0.95 & category == 'B'"
1 strain passed all filters

Specifying coverage:float explicitly also works.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --query-columns coverage:float \
> --output-strains filtered_strains.txt
3 strains were dropped during filtering
3 were filtered out by the query: "coverage >= 0.95 & category == 'B'"
1 strain passed all filters

Specifying coverage:float category:str also works.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --query-columns coverage:float category:str \
> --output-strains filtered_strains.txt
3 strains were dropped during filtering
3 were filtered out by the query: "coverage >= 0.95 & category == 'B'"
1 strain passed all filters

Specifying category:float does not work.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --query-columns category:float \
> --output-strains filtered_strains.txt
ERROR: Failed to convert value in column 'category' to float. Unable to parse string "A" at position 0
[2]
23 changes: 23 additions & 0 deletions tests/functional/filter/cram/filter-query-numerical.t
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,29 @@ The 'category' column will fail when used with a numerical comparison.
Ensure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.
[2]

With automatic type inference, the 'coverage' column isn't query-able with
string comparisons:

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage.str.endswith('.95')" \
> --output-strains filtered_strains.txt > /dev/null
ERROR: Internal Pandas error when applying query:
Can only use .str accessor with string values!
Ensure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.
[2]

However, that is still possible by explicitly specifying that it is a string column.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage.str.endswith('.95')" \
> --query-columns coverage:str \
> --output-strains filtered_strains.txt > /dev/null

$ sort filtered_strains.txt
SEQ_2

Create another metadata file for testing.

$ cat >metadata.tsv <<~~
Expand Down

0 comments on commit b0a0d11

Please sign in to comment.