diff --git a/CHANGES.md b/CHANGES.md index 2ffb17681..40d55200d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,11 +2,16 @@ ## __NEXT__ +### Features + +* filter: Added a new option `--max-length` to filter out sequences that are longer than a certain amount of base pairs. [#1429][] (@victorlin) + ### Bug Fixes * filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin) [#1425]: https://github.com/nextstrain/augur/pull/1425 +[#1429]: https://github.com/nextstrain/augur/pull/1429 ## 24.2.3 (23 February 2024) diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index 3e647f2c3..6fb09cae6 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -52,6 +52,7 @@ def register_arguments(parser): sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data") sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)") + sequence_filter_group.add_argument('--max-length', type=int, help="maximum length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)") sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters") subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data") diff --git a/augur/filter/_run.py b/augur/filter/_run.py index 57a25526e..3cab70cae 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -428,7 +428,8 @@ def run(args): include_exclude_rules.filter_by_ambiguous_date.__name__: "{count} {were} dropped because of their ambiguous date in {ambiguity}", include_exclude_rules.filter_by_min_date.__name__: "{count} {were} dropped because {they} {were} earlier than {min_date} or missing a date", include_exclude_rules.filter_by_max_date.__name__: "{count} {were} dropped because {they} {were} later than {max_date} or missing a date", - include_exclude_rules.filter_by_sequence_length.__name__: "{count} {were} dropped because {they} {were} shorter than minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)", + include_exclude_rules.filter_by_min_length.__name__: "{count} {were} dropped because {they} {were} shorter than the minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)", + include_exclude_rules.filter_by_max_length.__name__: "{count} {were} dropped because {they} {were} longer than the maximum length of {max_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)", include_exclude_rules.filter_by_non_nucleotide.__name__: "{count} {were} dropped because {they} had non-nucleotide characters", include_exclude_rules.skip_group_by_with_ambiguous_year.__name__: "{count} {were} dropped during grouping due to ambiguous year information", include_exclude_rules.skip_group_by_with_ambiguous_month.__name__: "{count} {were} dropped during grouping due to ambiguous month information", diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py index a8de9aba7..f6d362779 100644 --- a/augur/filter/include_exclude_rules.py +++ b/augur/filter/include_exclude_rules.py @@ -424,7 +424,7 @@ def filter_by_sequence_index(metadata, sequence_index) -> FilterFunctionReturn: return metadata_strains & sequence_index_strains -def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFunctionReturn: +def filter_by_min_length(metadata, sequence_index, min_length) -> FilterFunctionReturn: """Filter metadata by sequence length from a given sequence index. Parameters @@ -440,13 +440,13 @@ def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFun -------- >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain") - >>> filter_by_sequence_length(metadata, sequence_index, min_length=27000) + >>> filter_by_min_length(metadata, sequence_index, min_length=27000) {'strain1'} It is possible for the sequence index to be missing strains present in the metadata. >>> sequence_index = pd.DataFrame([{"strain": "strain3", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain") - >>> filter_by_sequence_length(metadata, sequence_index, min_length=27000) + >>> filter_by_min_length(metadata, sequence_index, min_length=27000) set() """ @@ -459,6 +459,34 @@ def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFun return set(filtered_sequence_index[filtered_sequence_index["ACGT"] >= min_length].index.values) +def filter_by_max_length(metadata, sequence_index, max_length) -> FilterFunctionReturn: + """Filter metadata by sequence length from a given sequence index. + + Parameters + ---------- + metadata : pandas.DataFrame + Metadata indexed by strain name + sequence_index : pandas.DataFrame + Sequence index + max_length : int + Maximum number of standard nucleotide characters (A, C, G, or T) in each sequence + + Examples + -------- + >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) + >>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain") + >>> filter_by_max_length(metadata, sequence_index, max_length=27000) + {'strain2'} + """ + strains = set(metadata.index.values) + filtered_sequence_index = sequence_index.loc[ + sequence_index.index.intersection(strains) + ] + filtered_sequence_index["ACGT"] = filtered_sequence_index.loc[:, ["A", "C", "G", "T"]].sum(axis=1) + + return set(filtered_sequence_index[filtered_sequence_index["ACGT"] <= max_length].index.values) + + def filter_by_non_nucleotide(metadata, sequence_index) -> FilterFunctionReturn: """Filter metadata for strains with invalid nucleotide content. @@ -667,21 +695,31 @@ def construct_filters(args, sequence_index) -> Tuple[List[FilterOption], List[Fi )) # Filter by sequence length. + # Skip VCF files and warn the user that length filters do not + # make sense for VCFs. + is_vcf = filename_is_vcf(args.sequences) if args.min_length: - # Skip VCF files and warn the user that the min length filter does not - # make sense for VCFs. - is_vcf = filename_is_vcf(args.sequences) - - if is_vcf: #doesn't make sense for VCF, ignore. + if is_vcf: print_err("WARNING: Cannot use min_length for VCF files. Ignoring...") else: exclude_by.append(( - filter_by_sequence_length, + filter_by_min_length, { "sequence_index": sequence_index, "min_length": args.min_length, } )) + if args.max_length: + if is_vcf: + print_err("WARNING: Cannot use max_length for VCF files. Ignoring...") + else: + exclude_by.append(( + filter_by_max_length, + { + "sequence_index": sequence_index, + "max_length": args.max_length, + } + )) # Exclude sequences with non-nucleotide characters. if args.non_nucleotide: @@ -755,13 +793,13 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil annotated in a sequence index. >>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}, {"strain": "strain3", "A": 1250, "C": 1250, "G": 1250, "T": 1250}]).set_index("strain") - >>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})] + >>> exclude_by = [(filter_by_min_length, {"sequence_index": sequence_index, "min_length": 27000})] >>> include_by = [(force_include_where, {"include_where": "region=Europe"})] >>> strains_to_keep, strains_to_exclude, strains_to_include = apply_filters(metadata, exclude_by, include_by) >>> strains_to_keep {'strain1'} >>> sorted(strains_to_exclude, key=lambda record: record["strain"]) - [{'strain': 'strain2', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}] + [{'strain': 'strain2', 'filter': 'filter_by_min_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_min_length', 'kwargs': '[["min_length", 27000]]'}] >>> strains_to_include [{'strain': 'strain2', 'filter': 'force_include_where', 'kwargs': '[["include_where", "region=Europe"]]'}] @@ -847,9 +885,9 @@ def _filter_kwargs_to_str(kwargs: FilterFunctionKwargs): Examples -------- >>> from augur.dates import numeric_date - >>> from augur.filter.include_exclude_rules import filter_by_sequence_length, filter_by_min_date + >>> from augur.filter.include_exclude_rules import filter_by_min_length, filter_by_min_date >>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}, {"strain": "strain2", "ACGT": 26000}, {"strain": "strain3", "ACGT": 5000}]).set_index("strain") - >>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})] + >>> exclude_by = [(filter_by_min_length, {"sequence_index": sequence_index, "min_length": 27000})] >>> _filter_kwargs_to_str(exclude_by[0][1]) '[["min_length", 27000]]' >>> exclude_by = [(filter_by_min_date, {"date_column": "date", "min_date": numeric_date("2020-03-01")})] diff --git a/augur/filter/validate_arguments.py b/augur/filter/validate_arguments.py index 866989303..49ee310bd 100644 --- a/augur/filter/validate_arguments.py +++ b/augur/filter/validate_arguments.py @@ -4,6 +4,7 @@ SEQUENCE_ONLY_FILTERS = ( "min_length", + "max_length", "non_nucleotide", ) diff --git a/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t b/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t deleted file mode 100644 index 050db8b26..000000000 --- a/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t +++ /dev/null @@ -1,13 +0,0 @@ -Setup - - $ source "$TESTDIR"/_setup.sh - -Try to filter using only metadata without a sequence index. -This should fail because the requested filters rely on sequence information. - - $ ${AUGUR} filter \ - > --metadata "$TESTDIR/../data/metadata.tsv" \ - > --min-length 10000 \ - > --output-strains filtered_strains.txt > /dev/null - ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information. - [2] diff --git a/tests/functional/filter/cram/filter-min-length-output-metadata.t b/tests/functional/filter/cram/filter-min-length-output-metadata.t deleted file mode 100644 index 9cb7f26c2..000000000 --- a/tests/functional/filter/cram/filter-min-length-output-metadata.t +++ /dev/null @@ -1,17 +0,0 @@ -Setup - - $ source "$TESTDIR"/_setup.sh - -Filter using only metadata without sequence input or output and save results as filtered metadata. - - $ ${AUGUR} filter \ - > --sequence-index "$TESTDIR/../data/sequence_index.tsv" \ - > --metadata "$TESTDIR/../data/metadata.tsv" \ - > --min-date 2012 \ - > --min-length 10500 \ - > --output-metadata filtered_metadata.tsv > /dev/null - -Output should include the 8 sequences matching the filters and a header line. - - $ wc -l filtered_metadata.tsv - \s*9 .* (re) diff --git a/tests/functional/filter/cram/filter-min-length-output-strains.t b/tests/functional/filter/cram/filter-min-length-output-strains.t deleted file mode 100644 index e0486eb89..000000000 --- a/tests/functional/filter/cram/filter-min-length-output-strains.t +++ /dev/null @@ -1,17 +0,0 @@ -Setup - - $ source "$TESTDIR"/_setup.sh - -Filter using only metadata and save results as a list of filtered strains. - - $ ${AUGUR} filter \ - > --sequence-index "$TESTDIR/../data/sequence_index.tsv" \ - > --metadata "$TESTDIR/../data/metadata.tsv" \ - > --min-date 2012 \ - > --min-length 10500 \ - > --output-strains filtered_strains.txt > /dev/null - -Output should include only the 8 sequences matching the filters (without a header line). - - $ wc -l filtered_strains.txt - \s*8 .* (re) diff --git a/tests/functional/filter/cram/filter-no-sequence-index-error.t b/tests/functional/filter/cram/filter-no-sequence-index-error.t new file mode 100644 index 000000000..657463da5 --- /dev/null +++ b/tests/functional/filter/cram/filter-no-sequence-index-error.t @@ -0,0 +1,28 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Try to filter using only metadata without a sequence index. + +These should fail because the requested filters rely on sequence information. + + $ ${AUGUR} filter \ + > --metadata "$TESTDIR/../data/metadata.tsv" \ + > --min-length 10000 \ + > --output-strains filtered_strains.txt > /dev/null + ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information. + [2] + + $ ${AUGUR} filter \ + > --metadata "$TESTDIR/../data/metadata.tsv" \ + > --max-length 10000 \ + > --output-strains filtered_strains.txt > /dev/null + ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information. + [2] + + $ ${AUGUR} filter \ + > --metadata "$TESTDIR/../data/metadata.tsv" \ + > --non-nucleotide \ + > --output-strains filtered_strains.txt > /dev/null + ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information. + [2] diff --git a/tests/functional/filter/cram/filter-sequence-length.t b/tests/functional/filter/cram/filter-sequence-length.t new file mode 100644 index 000000000..abd5303bd --- /dev/null +++ b/tests/functional/filter/cram/filter-sequence-length.t @@ -0,0 +1,18 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Filter using --min-length and --max-length. + + $ ${AUGUR} filter \ + > --sequence-index "$TESTDIR/../data/sequence_index.tsv" \ + > --metadata "$TESTDIR/../data/metadata.tsv" \ + > --min-length 10500 \ + > --max-length 10700 \ + > --output-strains filtered_strains.txt + 7 strains were dropped during filtering + 1 had no metadata + 1 had no sequence data + 2 were dropped because they were shorter than the minimum length of 10500bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive) + 3 were dropped because they were longer than the maximum length of 10700bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive) + 6 strains passed all filters