Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

filter: Add --max-length option #1429

Merged
merged 7 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@

## __NEXT__

### Features

* filter: Added a new option `--max-length` to filter out sequences that are longer than a certain amount of base pairs. [#1429][] (@victorlin)

### Bug Fixes

* filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin)

[#1425]: https://github.com/nextstrain/augur/pull/1425
[#1429]: https://github.com/nextstrain/augur/pull/1429

## 24.2.3 (23 February 2024)

Expand Down
1 change: 1 addition & 0 deletions augur/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def register_arguments(parser):

sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
sequence_filter_group.add_argument('--max-length', type=int, help="maximum length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")

subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
Expand Down
3 changes: 2 additions & 1 deletion augur/filter/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,8 @@ def run(args):
include_exclude_rules.filter_by_ambiguous_date.__name__: "{count} {were} dropped because of their ambiguous date in {ambiguity}",
include_exclude_rules.filter_by_min_date.__name__: "{count} {were} dropped because {they} {were} earlier than {min_date} or missing a date",
include_exclude_rules.filter_by_max_date.__name__: "{count} {were} dropped because {they} {were} later than {max_date} or missing a date",
include_exclude_rules.filter_by_sequence_length.__name__: "{count} {were} dropped because {they} {were} shorter than minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
include_exclude_rules.filter_by_min_length.__name__: "{count} {were} dropped because {they} {were} shorter than the minimum length of {min_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
include_exclude_rules.filter_by_max_length.__name__: "{count} {were} dropped because {they} {were} longer than the maximum length of {max_length}bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)",
include_exclude_rules.filter_by_non_nucleotide.__name__: "{count} {were} dropped because {they} had non-nucleotide characters",
include_exclude_rules.skip_group_by_with_ambiguous_year.__name__: "{count} {were} dropped during grouping due to ambiguous year information",
include_exclude_rules.skip_group_by_with_ambiguous_month.__name__: "{count} {were} dropped during grouping due to ambiguous month information",
Expand Down
64 changes: 51 additions & 13 deletions augur/filter/include_exclude_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def filter_by_sequence_index(metadata, sequence_index) -> FilterFunctionReturn:
return metadata_strains & sequence_index_strains


def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFunctionReturn:
def filter_by_min_length(metadata, sequence_index, min_length) -> FilterFunctionReturn:
"""Filter metadata by sequence length from a given sequence index.

Parameters
Expand All @@ -440,13 +440,13 @@ def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFun
--------
>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
>>> filter_by_sequence_length(metadata, sequence_index, min_length=27000)
>>> filter_by_min_length(metadata, sequence_index, min_length=27000)
{'strain1'}

It is possible for the sequence index to be missing strains present in the metadata.

>>> sequence_index = pd.DataFrame([{"strain": "strain3", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
>>> filter_by_sequence_length(metadata, sequence_index, min_length=27000)
>>> filter_by_min_length(metadata, sequence_index, min_length=27000)
set()

"""
Expand All @@ -459,6 +459,34 @@ def filter_by_sequence_length(metadata, sequence_index, min_length) -> FilterFun
return set(filtered_sequence_index[filtered_sequence_index["ACGT"] >= min_length].index.values)


def filter_by_max_length(metadata, sequence_index, max_length) -> FilterFunctionReturn:
"""Filter metadata by sequence length from a given sequence index.

Parameters
----------
metadata : pandas.DataFrame
Metadata indexed by strain name
sequence_index : pandas.DataFrame
Sequence index
max_length : int
Maximum number of standard nucleotide characters (A, C, G, or T) in each sequence

Examples
--------
>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain")
>>> filter_by_max_length(metadata, sequence_index, max_length=27000)
{'strain2'}
"""
strains = set(metadata.index.values)
filtered_sequence_index = sequence_index.loc[
sequence_index.index.intersection(strains)
]
filtered_sequence_index["ACGT"] = filtered_sequence_index.loc[:, ["A", "C", "G", "T"]].sum(axis=1)

return set(filtered_sequence_index[filtered_sequence_index["ACGT"] <= max_length].index.values)


def filter_by_non_nucleotide(metadata, sequence_index) -> FilterFunctionReturn:
"""Filter metadata for strains with invalid nucleotide content.

Expand Down Expand Up @@ -667,21 +695,31 @@ def construct_filters(args, sequence_index) -> Tuple[List[FilterOption], List[Fi
))

# Filter by sequence length.
# Skip VCF files and warn the user that length filters do not
# make sense for VCFs.
is_vcf = filename_is_vcf(args.sequences)
if args.min_length:
# Skip VCF files and warn the user that the min length filter does not
# make sense for VCFs.
is_vcf = filename_is_vcf(args.sequences)

if is_vcf: #doesn't make sense for VCF, ignore.
if is_vcf:
print_err("WARNING: Cannot use min_length for VCF files. Ignoring...")
else:
exclude_by.append((
filter_by_sequence_length,
filter_by_min_length,
{
"sequence_index": sequence_index,
"min_length": args.min_length,
}
))
if args.max_length:
if is_vcf:
print_err("WARNING: Cannot use max_length for VCF files. Ignoring...")
else:
exclude_by.append((
filter_by_max_length,
{
"sequence_index": sequence_index,
"max_length": args.max_length,
}
))

# Exclude sequences with non-nucleotide characters.
if args.non_nucleotide:
Expand Down Expand Up @@ -755,13 +793,13 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil
annotated in a sequence index.

>>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}, {"strain": "strain3", "A": 1250, "C": 1250, "G": 1250, "T": 1250}]).set_index("strain")
>>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> exclude_by = [(filter_by_min_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> include_by = [(force_include_where, {"include_where": "region=Europe"})]
>>> strains_to_keep, strains_to_exclude, strains_to_include = apply_filters(metadata, exclude_by, include_by)
>>> strains_to_keep
{'strain1'}
>>> sorted(strains_to_exclude, key=lambda record: record["strain"])
[{'strain': 'strain2', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}]
[{'strain': 'strain2', 'filter': 'filter_by_min_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_min_length', 'kwargs': '[["min_length", 27000]]'}]
>>> strains_to_include
[{'strain': 'strain2', 'filter': 'force_include_where', 'kwargs': '[["include_where", "region=Europe"]]'}]

Expand Down Expand Up @@ -847,9 +885,9 @@ def _filter_kwargs_to_str(kwargs: FilterFunctionKwargs):
Examples
--------
>>> from augur.dates import numeric_date
>>> from augur.filter.include_exclude_rules import filter_by_sequence_length, filter_by_min_date
>>> from augur.filter.include_exclude_rules import filter_by_min_length, filter_by_min_date
>>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}, {"strain": "strain2", "ACGT": 26000}, {"strain": "strain3", "ACGT": 5000}]).set_index("strain")
>>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> exclude_by = [(filter_by_min_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> _filter_kwargs_to_str(exclude_by[0][1])
'[["min_length", 27000]]'
>>> exclude_by = [(filter_by_min_date, {"date_column": "date", "min_date": numeric_date("2020-03-01")})]
Expand Down
1 change: 1 addition & 0 deletions augur/filter/validate_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

SEQUENCE_ONLY_FILTERS = (
"min_length",
"max_length",
"non_nucleotide",
)

Expand Down

This file was deleted.

17 changes: 0 additions & 17 deletions tests/functional/filter/cram/filter-min-length-output-metadata.t

This file was deleted.

17 changes: 0 additions & 17 deletions tests/functional/filter/cram/filter-min-length-output-strains.t

This file was deleted.

28 changes: 28 additions & 0 deletions tests/functional/filter/cram/filter-no-sequence-index-error.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Setup

$ source "$TESTDIR"/_setup.sh

Try to filter using only metadata without a sequence index.

These should fail because the requested filters rely on sequence information.

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --min-length 10000 \
> --output-strains filtered_strains.txt > /dev/null
ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
[2]

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --max-length 10000 \
> --output-strains filtered_strains.txt > /dev/null
ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
[2]

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --non-nucleotide \
> --output-strains filtered_strains.txt > /dev/null
ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
[2]
18 changes: 18 additions & 0 deletions tests/functional/filter/cram/filter-sequence-length.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
Setup

$ source "$TESTDIR"/_setup.sh

Filter using --min-length and --max-length.

$ ${AUGUR} filter \
> --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --min-length 10500 \
> --max-length 10700 \
> --output-strains filtered_strains.txt
7 strains were dropped during filtering
1 had no metadata
1 had no sequence data
2 were dropped because they were shorter than the minimum length of 10500bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)
3 were dropped because they were longer than the maximum length of 10700bp when only counting standard nucleotide characters A, C, G, or T (case-insensitive)
6 strains passed all filters
Loading