Skip to content

Commit

Permalink
filter: Add --query-columns option
Browse files Browse the repository at this point in the history
This serves as an "escape hatch" for when automatic type inference does
not work as expected for whatever reason.
  • Loading branch information
victorlin committed Feb 8, 2024
1 parent f2b807b commit b0a0d11
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 11 deletions.
6 changes: 6 additions & 0 deletions augur/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Filter and subsample a sequence set.
"""
from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
from augur.filter.io import ACCEPTED_TYPES, column_type_pair
from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN
from augur.types import EmptyOutputReportingMethod
from . import constants
Expand All @@ -28,6 +29,11 @@ def register_arguments(parser):
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
)
metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", help=f"""
Use alongside --query to specify columns and data types in the format 'column:type', where type is one of ({','.join(ACCEPTED_TYPES)}).
Automatic type inference will be attempted on all unspecified columns used in the query.
Example: region:str coverage:float.
""")
metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
Expand Down
46 changes: 35 additions & 11 deletions augur/filter/include_exclude_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
import numpy as np
import pandas as pd
from typing import Any, Callable, Dict, List, Set, Tuple
from typing import Any, Callable, Dict, List, Optional, Set, Tuple

from augur.dates import is_date_ambiguous, get_numerical_dates
from augur.errors import AugurError
Expand Down Expand Up @@ -165,7 +165,7 @@ def filter_by_exclude_where(metadata, exclude_where) -> FilterFunctionReturn:
return filtered


def filter_by_query(metadata: pd.DataFrame, query: str) -> FilterFunctionReturn:
def filter_by_query(metadata: pd.DataFrame, query: str, column_types: Optional[Dict[str, str]] = None) -> FilterFunctionReturn:
"""Filter metadata in the given pandas DataFrame with a query string and return
the strain names that pass the filter.
Expand All @@ -175,6 +175,8 @@ def filter_by_query(metadata: pd.DataFrame, query: str) -> FilterFunctionReturn:
Metadata indexed by strain name
query : str
Query string for the dataframe.
column_types : dict, optional
Dict mapping column names to data types (e.g. {'coverage': 'float'}).
Examples
--------
Expand All @@ -188,6 +190,9 @@ def filter_by_query(metadata: pd.DataFrame, query: str) -> FilterFunctionReturn:
# Create a copy to prevent modification of the original DataFrame.
metadata_copy = metadata.copy()

if column_types is None:
column_types = {}

# Set columns for type conversion.
variables = extract_variables(query)
if variables is not None:
Expand All @@ -196,16 +201,31 @@ def filter_by_query(metadata: pd.DataFrame, query: str) -> FilterFunctionReturn:
# Column extraction failed. Apply type conversion to all columns.
columns = metadata_copy.columns

# Support numeric comparisons in query strings.
#
# The built-in data type inference when loading the DataFrame does not
# If a type is not explicitly provided, try converting the column to numeric.
# This should cover most use cases, since one common problem is that the
# built-in data type inference when loading the DataFrame does not
# support nullable numeric columns, so numeric comparisons won't work on
# those columns. pd.to_numeric does proper conversion on those columns, and
# will not make any changes to columns with other values.
#
# TODO: Try boolean conversion?
# those columns. pd.to_numeric does proper conversion on those columns,
# and will not make any changes to columns with other values.
for column in columns:
metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='ignore')
column_types.setdefault(column, 'numeric')

# Convert data types before applying the query.
for column, dtype in column_types.items():
if dtype == 'numeric':
metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='ignore')
elif dtype == 'int':
try:
metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise', downcast='integer')
except ValueError as e:
raise AugurError(f"Failed to convert value in column {column!r} to int. {e}")
elif dtype == 'float':
try:
metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise', downcast='float')
except ValueError as e:
raise AugurError(f"Failed to convert value in column {column!r} to float. {e}")
elif dtype == 'str':
metadata_copy[column] = metadata_copy[column].astype('str', errors='ignore')

try:
return set(metadata_copy.query(query).index.values)
Expand Down Expand Up @@ -581,9 +601,13 @@ def construct_filters(args, sequence_index) -> Tuple[List[FilterOption], List[Fi

# Exclude strains by metadata, using pandas querying.
if args.query:
kwargs = {"query": args.query}
if args.query_columns:
kwargs["column_types"] = {column: dtype for column, dtype in args.query_columns}

exclude_by.append((
filter_by_query,
{"query": args.query}
kwargs
))

# Filter by ambiguous dates.
Expand Down
22 changes: 22 additions & 0 deletions augur/filter/io.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import argparse
import csv
import os
import re
from typing import Sequence, Set
import numpy as np
from collections import defaultdict
Expand Down Expand Up @@ -65,6 +67,26 @@ def write_metadata_based_outputs(input_metadata_path: str, delimiters: Sequence[
output_strains.close()


# Data types accepted in the 'column:type' pairs parsed by column_type_pair().
ACCEPTED_TYPES = {'int', 'float', 'str'}

def column_type_pair(input: str):
    """Get a 2-tuple for column name to type.

    Intended to be used as the argument type converter for argparse options that
    take type maps in a 'column:type' format.

    Parameters
    ----------
    input : str
        Value in the format 'column:type', where type is one of ACCEPTED_TYPES.
        The type is matched at the end of the string, so the column name itself
        may contain colons (e.g. 'a:b:int' parses as ('a:b', 'int')).

    Returns
    -------
    tuple of (str, str)
        The (column, type) pair.

    Raises
    ------
    argparse.ArgumentTypeError
        If the value is not in the 'column:type' format.
    """
    # Sort the accepted types so the regex alternation and, more importantly,
    # the user-facing error message are deterministic — iterating a set
    # directly yields an arbitrary, hash-seed-dependent order.
    accepted = sorted(ACCEPTED_TYPES)

    # Non-greedy column capture anchored at end-of-string: the type must be
    # the final colon-delimited segment.
    match = re.match(f"^(.+?):({'|'.join(accepted)})$", input)
    if not match:
        raise argparse.ArgumentTypeError(f"Column data types must be in the format 'column:type', where type is one of ({','.join(accepted)}).")

    column = match[1]
    dtype = match[2]

    return (column, dtype)


def cleanup_outputs(args):
"""Remove output files. Useful when terminating midway through a loop of metadata chunks."""
if args.output:
Expand Down
55 changes: 55 additions & 0 deletions tests/functional/filter/cram/filter-query-columns.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
Setup

$ source "$TESTDIR"/_setup.sh

Create metadata file for testing.

$ cat >metadata.tsv <<~~
> strain coverage category
> SEQ_1 0.94 A
> SEQ_2 0.95 B
> SEQ_3 0.96 C
> SEQ_4
> ~~

Automatic inference works.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --output-strains filtered_strains.txt
3 strains were dropped during filtering
3 were filtered out by the query: "coverage >= 0.95 & category == 'B'"
1 strain passed all filters

Specifying coverage:float explicitly also works.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --query-columns coverage:float \
> --output-strains filtered_strains.txt
3 strains were dropped during filtering
3 were filtered out by the query: "coverage >= 0.95 & category == 'B'"
1 strain passed all filters

Specifying coverage:float category:str also works.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --query-columns coverage:float category:str \
> --output-strains filtered_strains.txt
3 strains were dropped during filtering
3 were filtered out by the query: "coverage >= 0.95 & category == 'B'"
1 strain passed all filters

Specifying category:float does not work.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --query-columns category:float \
> --output-strains filtered_strains.txt
ERROR: Failed to convert value in column 'category' to float. Unable to parse string "A" at position 0
[2]
23 changes: 23 additions & 0 deletions tests/functional/filter/cram/filter-query-numerical.t
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,29 @@ The 'category' column will fail when used with a numerical comparison.
Ensure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.
[2]

With automatic type inference, the 'coverage' column isn't query-able with
string comparisons:

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage.str.endswith('.95')" \
> --output-strains filtered_strains.txt > /dev/null
ERROR: Internal Pandas error when applying query:
Can only use .str accessor with string values!
Ensure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.
[2]

However, that is still possible by explicitly specifying that it is a string column.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage.str.endswith('.95')" \
> --query-columns coverage:str \
> --output-strains filtered_strains.txt > /dev/null

$ sort filtered_strains.txt
SEQ_2

Create another metadata file for testing.

$ cat >metadata.tsv <<~~
Expand Down

0 comments on commit b0a0d11

Please sign in to comment.