From b8fb8ecc67c5e4a63ba3feec0baaaeca6ed5617e Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:53:56 -0700 Subject: [PATCH 1/4] Use args.output_sequences This makes it easier to search for usage, since `args.output` also matches other output options. --- augur/filter/__init__.py | 2 +- augur/filter/_run.py | 8 ++++---- augur/filter/io.py | 4 ++-- augur/filter/validate_arguments.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index 2c6c9d7db..bd3981997 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -102,7 +102,7 @@ def register_arguments(parser): subsample_group.add_argument('--subsample-seed', type=int, help="random number generator seed to allow reproducible subsampling (with same input data).") output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output, --output-metadata, --output-strains) is required") - output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format") + output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format", dest="output_sequences") output_group.add_argument('--output-metadata', help="metadata for strains that passed filters") output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)") output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.") diff --git a/augur/filter/_run.py b/augur/filter/_run.py index cca387f13..0ddac4f93 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -376,10 +376,10 @@ def run(args): # to update the set of strains to keep based on which strains are actually # available. if is_vcf: - if args.output: + if args.output_sequences: # Get the samples to be deleted, not to keep, for VCF dropped_samps = list(sequence_strains - valid_strains) - write_vcf(args.sequences, args.output, dropped_samps) + write_vcf(args.sequences, args.output_sequences, dropped_samps) elif args.sequences: sequences = read_sequences(args.sequences) @@ -388,9 +388,9 @@ def run(args): # Even if we aren't emitting sequences, we track the observed strain # names in the sequence file as part of the single pass to allow # comparison with the provided sequence index. - if args.output: + if args.output_sequences: observed_sequence_strains = set() - with open_file(args.output, "wt") as output_handle: + with open_file(args.output_sequences, "wt") as output_handle: for sequence in sequences: observed_sequence_strains.add(sequence.id) diff --git a/augur/filter/io.py b/augur/filter/io.py index 0e03b12e5..8ed8c409d 100644 --- a/augur/filter/io.py +++ b/augur/filter/io.py @@ -160,8 +160,8 @@ def column_type_pair(input: str): def cleanup_outputs(args): """Remove output files. Useful when terminating midway through a loop of metadata chunks.""" - if args.output: - _try_remove(args.output) + if args.output_sequences: + _try_remove(args.output_sequences) if args.output_metadata: _try_remove(args.output_metadata) if args.output_strains: diff --git a/augur/filter/validate_arguments.py b/augur/filter/validate_arguments.py index 9e639a83d..3bce8d5d9 100644 --- a/augur/filter/validate_arguments.py +++ b/augur/filter/validate_arguments.py @@ -19,11 +19,11 @@ def validate_arguments(args): Parsed arguments from argparse """ # Don't allow sequence output when no sequence input is provided. - if args.output and not args.sequences: + if args.output_sequences and not args.sequences: raise AugurError("You need to provide sequences to output sequences.") # Confirm that at least one output was requested. - if not any((args.output, args.output_metadata, args.output_strains)): + if not any((args.output_sequences, args.output_metadata, args.output_strains)): raise AugurError("You need to select at least one output.") # Don't allow filtering on sequence-based information, if no sequences or From febd90e7a98a1d93bb31e68db7151bb6aff17a96 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:24:46 -0700 Subject: [PATCH 2/4] Update tests to use --output-sequences Planning for upcoming deprecation. --- .../filter/cram/filter-output-strains-no-sequence-error.t | 2 +- tests/functional/filter/cram/filter-sequences-vcf.t | 2 +- .../subsample-max-sequences-no-probabilistic-sampling-error.t | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t b/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t index e6e6a256f..996332ee9 100644 --- a/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t +++ b/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t @@ -9,6 +9,6 @@ This should fail. > --sequence-index "$TESTDIR/../data/sequence_index.tsv" \ > --metadata "$TESTDIR/../data/metadata.tsv" \ > --min-length 10000 \ - > --output filtered.fasta > /dev/null + > --output-sequences filtered.fasta > /dev/null ERROR: You need to provide sequences to output sequences. [2] diff --git a/tests/functional/filter/cram/filter-sequences-vcf.t b/tests/functional/filter/cram/filter-sequences-vcf.t index f6ddaa6cb..f5d362154 100644 --- a/tests/functional/filter/cram/filter-sequences-vcf.t +++ b/tests/functional/filter/cram/filter-sequences-vcf.t @@ -8,7 +8,7 @@ Filter TB strains from VCF and save as a list of filtered strains. > --sequences "$TESTDIR/../data/tb.vcf.gz" \ > --metadata "$TESTDIR/../data/tb_metadata.tsv" \ > --min-date 2012 \ - > --output filtered.vcf \ + > --output-sequences filtered.vcf \ > --output-strains filtered_strains.txt > /dev/null Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. 162 strains were dropped during filtering diff --git a/tests/functional/filter/cram/subsample-max-sequences-no-probabilistic-sampling-error.t b/tests/functional/filter/cram/subsample-max-sequences-no-probabilistic-sampling-error.t index 97bc3f428..9ae3fa8b1 100644 --- a/tests/functional/filter/cram/subsample-max-sequences-no-probabilistic-sampling-error.t +++ b/tests/functional/filter/cram/subsample-max-sequences-no-probabilistic-sampling-error.t @@ -13,6 +13,6 @@ This should fail, as probabilistic sampling is explicitly disabled. > --subsample-max-sequences 5 \ > --subsample-seed 314159 \ > --no-probabilistic-sampling \ - > --output filtered.fasta + > --output-sequences filtered.fasta ERROR: Asked to provide at most 5 sequences, but there are 8 groups. [2] From 9f7e2d93ea7fc34b0e9b2ce7c857219b13466bea Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:25:28 -0700 Subject: [PATCH 3/4] Deprecate --output and -o - Put options in a new argument group for deprecated options - Add entry to DEPRECATED.md - Add test to ensure deprecated options still work --- DEPRECATED.md | 7 +++ augur/filter/__init__.py | 18 +++++- .../filter/cram/filter-deprecated-options.t | 56 +++++++++++++++++++ 3 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 tests/functional/filter/cram/filter-deprecated-options.t diff --git a/DEPRECATED.md b/DEPRECATED.md index 090326fe8..02af814c6 100644 --- a/DEPRECATED.md +++ b/DEPRECATED.md @@ -4,6 +4,13 @@ These features are deprecated, which means they are no longer maintained and will go away in a future major version of Augur. They are currently still available for backwards compatibility, but should not be used in new code. +## `augur filter` options `--output` and `-o` + +*Deprecated in version __NEXT__. Planned for removal February 2025 or after.* + +`--output` and `-o` are aliases to `--output-sequences`. They will be removed to +avoid confusion with other output options. + ## `xopen` major version 1 *Deprecated in version __NEXT__ (July 2024). Planned for removal at the earliest with major version 26 not before November 2024* diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index bd3981997..ad404a03c 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -1,10 +1,11 @@ """ Filter and subsample a sequence set. """ -from augur.argparse_ import ExtendOverwriteDefault +from augur.argparse_ import ExtendOverwriteDefault, SKIP_AUTO_DEFAULT_IN_HELP from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT from augur.filter.io import ACCEPTED_TYPES, column_type_pair from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN +from augur.io.print import print_err from augur.types import EmptyOutputReportingMethod from . import constants @@ -101,8 +102,8 @@ def register_arguments(parser): Since priorities represent relative values between strains, these values can be arbitrary.""") subsample_group.add_argument('--subsample-seed', type=int, help="random number generator seed to allow reproducible subsampling (with same input data).") - output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output, --output-metadata, --output-strains) is required") - output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format", dest="output_sequences") + output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output-sequences, --output-metadata, --output-strains) is required") + output_group.add_argument('--output-sequences', help="filtered sequences in FASTA format") output_group.add_argument('--output-metadata', help="metadata for strains that passed filters") output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)") output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.") @@ -114,6 +115,10 @@ def register_arguments(parser): default=EmptyOutputReportingMethod.ERROR, help="How should empty outputs be reported when no strains pass filtering and/or subsampling.") + deprecated_group = parser.add_argument_group("deprecated", "options to be removed in a future major version") + deprecated_group.add_argument('--output', metavar="FILE", help="alias to --output-sequences" + SKIP_AUTO_DEFAULT_IN_HELP) + deprecated_group.add_argument('-o', metavar="FILE", help="alias to --output-sequences" + SKIP_AUTO_DEFAULT_IN_HELP) + parser.set_defaults(probabilistic_sampling=True) @@ -127,6 +132,13 @@ def run(args): ''' filter and subsample a set of sequences into an analysis set ''' + if args.o: + print_err("WARNING: -o is deprecated. Use --output-sequences instead.") + args.output_sequences = args.o + if args.output: + print_err("WARNING: --output is deprecated. Use --output-sequences instead.") + args.output_sequences = args.output + from .validate_arguments import validate_arguments # Validate arguments before attempting any I/O. validate_arguments(args) diff --git a/tests/functional/filter/cram/filter-deprecated-options.t b/tests/functional/filter/cram/filter-deprecated-options.t new file mode 100644 index 000000000..f59ce4a54 --- /dev/null +++ b/tests/functional/filter/cram/filter-deprecated-options.t @@ -0,0 +1,56 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Create files + + $ cat >metadata.tsv <<~~ + > strain col1 + > SEQ1 A + > SEQ2 B + > ~~ + + $ cat >sequences.fasta <<~~ + > >SEQ1 + > AAAA + > >SEQ2 + > AAAA + > >SEQ3 + > AAAA + > ~~ + +--output alias to --output-sequences + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --sequences sequences.fasta \ + > --output filtered.fasta + WARNING: --output is deprecated. Use --output-sequences instead. + Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + 1 strain was dropped during filtering + 1 had no metadata + 2 strains passed all filters + + $ cat filtered.fasta + >SEQ1 + AAAA + >SEQ2 + AAAA + +-o alias to --output-sequences + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --sequences sequences.fasta \ + > -o filtered.fasta + WARNING: -o is deprecated. Use --output-sequences instead. + Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + 1 strain was dropped during filtering + 1 had no metadata + 2 strains passed all filters + + $ cat filtered.fasta + >SEQ1 + AAAA + >SEQ2 + AAAA From e700ea3734c3c0acf9c1c4fd3c3fbb7f9ebe91c8 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:37:28 -0700 Subject: [PATCH 4/4] Update changelog --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 07f652c88..97e5ef2a0 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,9 +5,11 @@ ### Bug Fixes * filter: Previously, when `--subsample-max-sequences` was slightly lower than the number of groups, it was possible to fail with an uncaught `AssertionError`. Internal calculations have been adjusted to prevent this from happening. [#1588][] [#1598][] (@victorlin) +* filter: Options `--output` and `-o` have been deprecated and will now show a warning message. See [DEPRECATED.md](./DEPRECATED.md) for details. [#1622][] (@victorlin) [#1588]: https://github.com/nextstrain/augur/issues/1588 [#1598]: https://github.com/nextstrain/augur/issues/1598 +[#1622]: https://github.com/nextstrain/augur/pull/1622 ## 25.4.0 (3 September 2024)