Merge pull request #1422 from nextstrain/filter-min-length-message

filter: Update `--min-length` help message
nextstrain · Feb 23, 2024 · f81e6a3 · f81e6a3
2 parents 14f5ce4 + b86e997
commit f81e6a3
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 2 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,6 +2,11 @@
 
 ## __NEXT__
 
+### Bug Fixes
+
+* filter: Updated the help and report text of `--min-length` to explicitly state that the minimum length filter only counts standard nucleotide characters A, C, G, or T (case-insensitive). This has been the behavior since version 3.0.3.dev1, but has never been explicitly documented. [#1422][] (@joverlee521)
+
+[#1422]: https://github.com/nextstrain/augur/pull/1422
 
 ## 24.2.2 (16 February 2024)
 

diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py
@@ -51,7 +51,7 @@ def register_arguments(parser):
         of an entry in --sequences.""")
 
     sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
-    sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences")
+    sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
     sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
 
     subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")

diff --git a/augur/filter/_run.py b/augur/filter/_run.py
@@ -428,7 +428,7 @@ def run(args):
         include_exclude_rules.filter_by_ambiguous_date.__name__: "{count} {were} dropped because of their ambiguous date in {ambiguity}",
         include_exclude_rules.filter_by_min_date.__name__: "{count} {were} dropped because {they} {were} earlier than {min_date} or missing a date",
         include_exclude_rules.filter_by_max_date.__name__: "{count} {were} dropped because {they} {were} later than {max_date} or missing a date",
-        include_exclude_rules.filter_by_sequence_length.__name__: "{count} {were} dropped because {they} {were} shorter than minimum length of {min_length}bp",
+        include_exclude_rules.filter_by_sequence_length.__name__: "{count} {were} dropped because {they} {were} shorter than minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
         include_exclude_rules.filter_by_non_nucleotide.__name__: "{count} {were} dropped because {they} had non-nucleotide characters",
         include_exclude_rules.skip_group_by_with_ambiguous_year.__name__: "{count} {were} dropped during grouping due to ambiguous year information",
         include_exclude_rules.skip_group_by_with_ambiguous_month.__name__: "{count} {were} dropped during grouping due to ambiguous month information",