Support nullable boolean columns

Nullable boolean columns were not previously supported by pandas.read_csv's built-in type inference, but supporting them aligns with the existing support for nullable numeric columns.
nextstrain · Feb 9, 2024 · bd35393
1 parent df97f72
commit bd35393
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 1 deletion.
diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py
@@ -244,14 +244,16 @@ def filter_by_query(metadata: pd.DataFrame, query: str, column_types: Optional[D
 
 
 def _string_to_boolean(s: str):
-    """Convert a string to a boolean value.
+    """Convert a string to an optional boolean value.
 
     Raises ValueError if it cannot be converted.
     """
     if s.lower() == 'true':
         return True
     elif s.lower() == 'false':
         return False
+    elif s == '':
+        return None
 
     raise ValueError(f"Unable to convert {s!r} to a boolean value.")
 

diff --git a/tests/functional/filter/cram/filter-query-boolean.t b/tests/functional/filter/cram/filter-query-boolean.t
@@ -56,3 +56,23 @@ Note that 1/0 can also be compared to boolean literals.
   1 strain was dropped during filtering
   	1 was filtered out by the query: "column == True"
   2 strains passed all filters
+
+Empty values are treated as missing and are filtered out by the query.
+
+  $ cat >metadata.tsv <<~~
+  > strain	column
+  > SEQ_1	True
+  > SEQ_2	False
+  > SEQ_3	
+  > ~~
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --query "column == True" \
+  >  --output-strains filtered_strains.txt
+  2 strains were dropped during filtering
+  	2 were filtered out by the query: "column == True"
+  1 strain passed all filters
+
+  $ sort filtered_strains.txt
+  SEQ_1