Skip to content

Commit

Permalink
Merge pull request #1429: filter: Add --max-length option
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin authored Mar 7, 2024
2 parents b0c5010 + 7c7353b commit 7197b9f
Show file tree
Hide file tree
Showing 10 changed files with 106 additions and 61 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@

## __NEXT__

### Features

* filter: Added a new option `--max-length` to filter out sequences that are longer than a certain amount of base pairs. [#1429][] (@victorlin)

### Bug Fixes

* filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin)

[#1425]: https://github.com/nextstrain/augur/pull/1425
[#1429]: https://github.com/nextstrain/augur/pull/1429

## 24.2.3 (23 February 2024)

Expand Down
1 change: 1 addition & 0 deletions augur/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def register_arguments(parser):

sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
sequence_filter_group.add_argument('--max-length', type=int, help="maximum length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")

subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
Expand Down
3 changes: 2 additions & 1 deletion augur/filter/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,8 @@ def run(args):
include_exclude_rules.filter_by_ambiguous_date.__name__: "{count} {were} dropped because of their ambiguous date in {ambiguity}",
include_exclude_rules.filter_by_min_date.__name__: "{count} {were} dropped because {they} {were} earlier than {min_date} or missing a date",
include_exclude_rules.filter_by_max_date.__name__: "{count} {were} dropped because {they} {were} later than {max_date} or missing a date",
include_exclude_rules.filter_by_sequence_length.__name__: "{count} {were} dropped because {they} {were} shorter than minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
include_exclude_rules.filter_by_min_length.__name__: "{count} {were} dropped because {they} {were} shorter than the minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
include_exclude_rules.filter_by_max_length.__name__: "{count} {were} dropped because {they} {were} longer than the maximum length of {max_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
include_exclude_rules.filter_by_non_nucleotide.__name__: "{count} {were} dropped because {they} had non-nucleotide characters",
include_exclude_rules.skip_group_by_with_ambiguous_year.__name__: "{count} {were} dropped during grouping due to ambiguous year information",
include_exclude_rules.skip_group_by_with_ambiguous_month.__name__: "{count} {were} dropped during grouping due to ambiguous month information",
Expand Down
64 changes: 51 additions & 13 deletions augur/filter/include_exclude_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def filter_by_sequence_index(metadata, sequence_index) -> FilterFunctionReturn:
return metadata_strains & sequence_index_strains


def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFunctionReturn:
def filter_by_min_length(metadata, sequence_index, min_length) -> FilterFunctionReturn:
"""Filter metadata by sequence length from a given sequence index.
Parameters
Expand All @@ -440,13 +440,13 @@ def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFun
--------
>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
>>> filter_by_sequence_length(metadata, sequence_index, min_length=27000)
>>> filter_by_min_length(metadata, sequence_index, min_length=27000)
{'strain1'}
It is possible for the sequence index to be missing strains present in the metadata.
>>> sequence_index = pd.DataFrame([{"strain": "strain3", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
>>> filter_by_sequence_length(metadata, sequence_index, min_length=27000)
>>> filter_by_min_length(metadata, sequence_index, min_length=27000)
set()
"""
Expand All @@ -459,6 +459,34 @@ def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFun
return set(filtered_sequence_index[filtered_sequence_index["ACGT"] >= min_length].index.values)


def filter_by_max_length(metadata, sequence_index, max_length) -> FilterFunctionReturn:
"""Filter metadata by sequence length from a given sequence index.
Parameters
----------
metadata : pandas.DataFrame
Metadata indexed by strain name
sequence_index : pandas.DataFrame
Sequence index
max_length : int
Maximum number of standard nucleotide characters (A, C, G, or T) in each sequence
Examples
--------
>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
>>> filter_by_max_length(metadata, sequence_index, max_length=27000)
{'strain2'}
"""
strains = set(metadata.index.values)
filtered_sequence_index = sequence_index.loc[
sequence_index.index.intersection(strains)
]
filtered_sequence_index["ACGT"] = filtered_sequence_index.loc[:, ["A", "C", "G", "T"]].sum(axis=1)

return set(filtered_sequence_index[filtered_sequence_index["ACGT"] <= max_length].index.values)


def filter_by_non_nucleotide(metadata, sequence_index) -> FilterFunctionReturn:
"""Filter metadata for strains with invalid nucleotide content.
Expand Down Expand Up @@ -667,21 +695,31 @@ def construct_filters(args, sequence_index) -> Tuple[List[FilterOption], List[Fi
))

# Filter by sequence length.
# Skip VCF files and warn the user that length filters do not
# make sense for VCFs.
is_vcf = filename_is_vcf(args.sequences)
if args.min_length:
# Skip VCF files and warn the user that the min length filter does not
# make sense for VCFs.
is_vcf = filename_is_vcf(args.sequences)

if is_vcf: #doesn't make sense for VCF, ignore.
if is_vcf:
print_err("WARNING: Cannot use min_length for VCF files. Ignoring...")
else:
exclude_by.append((
filter_by_sequence_length,
filter_by_min_length,
{
"sequence_index": sequence_index,
"min_length": args.min_length,
}
))
if args.max_length:
if is_vcf:
print_err("WARNING: Cannot use max_length for VCF files. Ignoring...")
else:
exclude_by.append((
filter_by_max_length,
{
"sequence_index": sequence_index,
"max_length": args.max_length,
}
))

# Exclude sequences with non-nucleotide characters.
if args.non_nucleotide:
Expand Down Expand Up @@ -755,13 +793,13 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil
annotated in a sequence index.
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}, {"strain": "strain3", "A": 1250, "C": 1250, "G": 1250, "T": 1250}]).set_index("strain")
>>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> exclude_by = [(filter_by_min_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> include_by = [(force_include_where, {"include_where": "region=Europe"})]
>>> strains_to_keep, strains_to_exclude, strains_to_include = apply_filters(metadata, exclude_by, include_by)
>>> strains_to_keep
{'strain1'}
>>> sorted(strains_to_exclude, key=lambda record: record["strain"])
[{'strain': 'strain2', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}]
[{'strain': 'strain2', 'filter': 'filter_by_min_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_min_length', 'kwargs': '[["min_length", 27000]]'}]
>>> strains_to_include
[{'strain': 'strain2', 'filter': 'force_include_where', 'kwargs': '[["include_where", "region=Europe"]]'}]
Expand Down Expand Up @@ -847,9 +885,9 @@ def _filter_kwargs_to_str(kwargs: FilterFunctionKwargs):
Examples
--------
>>> from augur.dates import numeric_date
>>> from augur.filter.include_exclude_rules import filter_by_sequence_length, filter_by_min_date
>>> from augur.filter.include_exclude_rules import filter_by_min_length, filter_by_min_date
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}, {"strain": "strain2", "ACGT": 26000}, {"strain": "strain3", "ACGT": 5000}]).set_index("strain")
>>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> exclude_by = [(filter_by_min_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> _filter_kwargs_to_str(exclude_by[0][1])
'[["min_length", 27000]]'
>>> exclude_by = [(filter_by_min_date, {"date_column": "date", "min_date": numeric_date("2020-03-01")})]
Expand Down
1 change: 1 addition & 0 deletions augur/filter/validate_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

SEQUENCE_ONLY_FILTERS = (
"min_length",
"max_length",
"non_nucleotide",
)

Expand Down

This file was deleted.

17 changes: 0 additions & 17 deletions tests/functional/filter/cram/filter-min-length-output-metadata.t

This file was deleted.

17 changes: 0 additions & 17 deletions tests/functional/filter/cram/filter-min-length-output-strains.t

This file was deleted.

28 changes: 28 additions & 0 deletions tests/functional/filter/cram/filter-no-sequence-index-error.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Setup

$ source "$TESTDIR"/_setup.sh

Try to filter using only metadata without a sequence index.

These should fail because the requested filters rely on sequence information.

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --min-length 10000 \
> --output-strains filtered_strains.txt > /dev/null
ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
[2]

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --max-length 10000 \
> --output-strains filtered_strains.txt > /dev/null
ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
[2]

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --non-nucleotide \
> --output-strains filtered_strains.txt > /dev/null
ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
[2]
18 changes: 18 additions & 0 deletions tests/functional/filter/cram/filter-sequence-length.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
Setup

$ source "$TESTDIR"/_setup.sh

Filter using --min-length and --max-length.

$ ${AUGUR} filter \
> --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --min-length 10500 \
> --max-length 10700 \
> --output-strains filtered_strains.txt
7 strains were dropped during filtering
1 had no metadata
1 had no sequence data
2 were dropped because they were shorter than the minimum length of 10500bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)
3 were dropped because they were longer than the maximum length of 10700bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)
6 strains passed all filters

0 comments on commit 7197b9f

Please sign in to comment.