Skip to content

Commit

Permalink
Replace backtick quoting in Pandas query
Browse files Browse the repository at this point in the history
Python's ast module cannot parse the backtick quoting that is supported by
pandas.DataFrame.query, so backtick-quoted names are replaced with placeholder
names before parsing, then mapped back to obtain the actual variable names.
  • Loading branch information
victorlin committed Feb 16, 2024
1 parent 71015ed commit 79262c0
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 15 deletions.
54 changes: 47 additions & 7 deletions augur/filter/include_exclude_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,15 +891,55 @@ def extract_variables(pandas_query: str):
>>> extract_variables("var1.str.startswith('prefix')")
{'var1'}
>>> extract_variables("this query is invalid")
Backtick quoting is also supported.
>>> extract_variables("`include me` == 'but not `me`'")
{'include me'}
>>> extract_variables("`include me once` == 'a' or `include me once` == 'b'")
{'include me once'}
"""
# Since Pandas' query grammar should be a subset of Python's, which uses the
# Since Pandas's query grammar is mostly a subset of Python's, which uses the
# ast stdlib under the hood, we can try to parse queries with that as well.
# Errors may arise from invalid query syntax or any Pandas syntax not
# covered by Python (unlikely, but I'm not sure). In those cases, don't
# return anything.
# Errors may arise from invalid query syntax or any unhandled Pandas-specific
# syntax. In those cases, don't return anything.
try:
return set(node.id
for node in ast.walk(ast.parse(pandas_query))
if isinstance(node, ast.Name))
# Replace the backtick quoting that is Pandas-specific syntax.
modified_query, replacements = _replace_backtick_quoting(pandas_query)
variables = set(node.id
for node in ast.walk(ast.parse(modified_query))
if isinstance(node, ast.Name))
for original_name, generated_name in replacements.items():
if generated_name in variables:
variables.remove(generated_name)
variables.add(original_name)
return variables
except:
return None


def _replace_backtick_quoting(pandas_query: str):
"""Replace backtick-quoted values with a generated value.
The generated value can be translated as a valid name (i.e. no spaces or
special characters).
Return the modified query and a dict mapping from the original to generated
value.
"""
pattern = r"`([^`]+)`"
replacements: Dict[str, str] = {}
name_counter = 1

def replace(match: re.Match):
nonlocal replacements
nonlocal name_counter
original_value = match.group(1)

if original_value not in replacements:
replacements[original_value] = f'__augur_filter_{name_counter}'
name_counter += 1
return replacements[original_value]

modified_query = re.sub(pattern, replace, pandas_query)
return modified_query, replacements
8 changes: 0 additions & 8 deletions tests/functional/filter/cram/filter-query-backtick-quoting.t
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,6 @@ The 'region name' column is query-able by backtick quoting.
> --metadata metadata.tsv \
> --query '(`region name` == "A")' \
> --output-strains filtered_strains.txt > /dev/null
WARNING: Could not infer columns from the pandas query. Reading all metadata columns,
which may impact execution time. If the query is valid, please open a new issue:

<https://github.com/nextstrain/augur/issues/new/choose>

and add the query to the description:

(`region name` == "A")

$ sort filtered_strains.txt
SEQ_1
Expand Down

0 comments on commit 79262c0

Please sign in to comment.