From 79262c005e282890f10d1e44a5ce2cbfdf79a630 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 15 Feb 2024 15:18:59 -0800
Subject: [PATCH] Replace backtick quoting in Pandas query

ast.walk does not support the special case of backtick quoting that is
supported by pandas.DataFrame.query, so it must be replaced for ast.walk
then reversed to obtain the actual variable names.
---
 augur/filter/include_exclude_rules.py    | 54 ++++++++++++++++---
 .../cram/filter-query-backtick-quoting.t |  8 ---
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py
index a333914f7..a8de9aba7 100644
--- a/augur/filter/include_exclude_rules.py
+++ b/augur/filter/include_exclude_rules.py
@@ -891,15 +891,55 @@ def extract_variables(pandas_query: str):
     >>> extract_variables("var1.str.startswith('prefix')")
     {'var1'}
     >>> extract_variables("this query is invalid")
+
+    Backtick quoting is also supported.
+
+    >>> extract_variables("`include me` == 'but not `me`'")
+    {'include me'}
+    >>> extract_variables("`include me once` == 'a' or `include me once` == 'b'")
+    {'include me once'}
     """
-    # Since Pandas' query grammar should be a subset of Python's, which uses the
+    # Since Pandas's query grammar is mostly a subset of Python's, which uses the
     # ast stdlib under the hood, we can try to parse queries with that as well.
-    # Errors may arise from invalid query syntax or any Pandas syntax not
-    # covered by Python (unlikely, but I'm not sure). In those cases, don't
-    # return anything.
+    # Errors may arise from invalid query syntax or any unhandled Pandas-specific
+    # syntax. In those cases, don't return anything.
     try:
-        return set(node.id
-                   for node in ast.walk(ast.parse(pandas_query))
-                   if isinstance(node, ast.Name))
+        # Replace the backtick quoting that is Pandas-specific syntax.
+        modified_query, replacements = _replace_backtick_quoting(pandas_query)
+        variables = set(node.id
+                        for node in ast.walk(ast.parse(modified_query))
+                        if isinstance(node, ast.Name))
+        for original_name, generated_name in replacements.items():
+            if generated_name in variables:
+                variables.remove(generated_name)
+                variables.add(original_name)
+        return variables
     except:
         return None
+
+
+def _replace_backtick_quoting(pandas_query: str):
+    """Replace backtick-quoted values with a generated value.
+
+    The generated value can be translated as a valid name (i.e. no spaces or
+    special characters).
+
+    Return the modified query and a dict mapping from the original to generated
+    value.
+    """
+    pattern = r"`([^`]+)`"
+    replacements: Dict[str, str] = {}
+    name_counter = 1
+
+    def replace(match: re.Match):
+        nonlocal replacements
+        nonlocal name_counter
+        original_value = match.group(1)
+
+        if original_value not in replacements:
+            replacements[original_value] = f'__augur_filter_{name_counter}'
+            name_counter += 1
+        return replacements[original_value]
+
+    modified_query = re.sub(pattern, replace, pandas_query)
+    return modified_query, replacements
diff --git a/tests/functional/filter/cram/filter-query-backtick-quoting.t b/tests/functional/filter/cram/filter-query-backtick-quoting.t
index bd74b45d0..84c9e48a4 100644
--- a/tests/functional/filter/cram/filter-query-backtick-quoting.t
+++ b/tests/functional/filter/cram/filter-query-backtick-quoting.t
@@ -18,14 +18,6 @@ The 'region name' column is query-able by backtick quoting.
   >   --metadata metadata.tsv \
   >   --query '(`region name` == "A")' \
   >   --output-strains filtered_strains.txt > /dev/null
-  WARNING: Could not infer columns from the pandas query. Reading all metadata columns,
-  which may impact execution time. If the query is valid, please open a new issue:
-  
-  
-  
-  and add the query to the description:
-  
-      (`region name` == "A")
 
   $ sort filtered_strains.txt
   SEQ_1