diff --git a/CHANGES.md b/CHANGES.md index 98a373fa6..5ff51b16b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -11,9 +11,11 @@ * validation: we no longer exit with a non-zero exit code when the requested validation mode is "warn" [#1440][] (@jameshadfield) * validation: we no longer perform any validation when the requested validation mode is "skip" [#1440][] (@jameshadfield) +* filter: Send all log messages to `stderr`. This allows output to be written to `stdout` (e.g. `--output-strains /dev/stdout`). [#1459][] (@victorlin) [#1440]: https://github.com/nextstrain/augur/pull/1440 [#1445]: https://github.com/nextstrain/augur/pull/1445 +[#1459]: https://github.com/nextstrain/augur/pull/1459 ## 24.3.0 (18 March 2024) diff --git a/augur/filter/_run.py b/augur/filter/_run.py index 3cab70cae..9a82c7f00 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -277,9 +277,9 @@ def run(args): raise AugurError(error) if (probabilistic_used): - print(f"Sampling probabilistically at {sequences_per_group:0.4f} sequences per group, meaning it is possible to have more than the requested maximum of {args.subsample_max_sequences} sequences after filtering.") + print_err(f"Sampling probabilistically at {sequences_per_group:0.4f} sequences per group, meaning it is possible to have more than the requested maximum of {args.subsample_max_sequences} sequences after filtering.") else: - print(f"Sampling at {sequences_per_group} per group.") + print_err(f"Sampling at {sequences_per_group} per group.") if queues_by_group is None: # We know all of the possible groups now from the first pass through @@ -414,10 +414,10 @@ def run(args): total_strains_passed = len(valid_strains) total_strains_filtered = len(metadata_strains) + num_excluded_by_lack_of_metadata - total_strains_passed - print(f"{total_strains_filtered} {'strain was' if total_strains_filtered == 1 else 'strains were'} dropped during filtering") + print_err(f"{total_strains_filtered} {'strain was' if total_strains_filtered == 1 else 'strains were'} dropped during filtering") if num_excluded_by_lack_of_metadata: - print(f"\t{num_excluded_by_lack_of_metadata} had no metadata") + print_err(f"\t{num_excluded_by_lack_of_metadata} had no metadata") report_template_by_filter_name = { include_exclude_rules.filter_by_sequence_index.__name__: "{count} had no sequence data", @@ -446,11 +446,11 @@ def run(args): parameters["count"] = count parameters["were"] = "was" if count == 1 else "were" parameters["they"] = "it" if count == 1 else "they" - print("\t" + report_template_by_filter_name[filter_name].format(**parameters)) + print_err("\t" + report_template_by_filter_name[filter_name].format(**parameters)) if (group_by and args.sequences_per_group) or args.subsample_max_sequences: seed_txt = ", using seed {}".format(args.subsample_seed) if args.subsample_seed else "" - print(f"\t{num_excluded_subsamp} {'was' if num_excluded_subsamp == 1 else 'were'} dropped because of subsampling criteria{seed_txt}") + print_err(f"\t{num_excluded_subsamp} {'was' if num_excluded_subsamp == 1 else 'were'} dropped because of subsampling criteria{seed_txt}") if total_strains_passed == 0: empty_results_message = "All samples have been dropped! Check filter rules and metadata file format." @@ -463,4 +463,4 @@ def run(args): else: raise ValueError(f"Encountered unhandled --empty-output-reporting method {args.empty_output_reporting!r}") - print(f"{total_strains_passed} {'strain' if total_strains_passed == 1 else 'strains'} passed all filters") + print_err(f"{total_strains_passed} {'strain' if total_strains_passed == 1 else 'strains'} passed all filters") diff --git a/augur/io/print.py b/augur/io/print.py index aff911db6..6ba31aab5 100644 --- a/augur/io/print.py +++ b/augur/io/print.py @@ -2,4 +2,6 @@ def print_err(*args): + """Print to stderr. When data goes to stdout (most cases), this should be + used for any informational messages, not just errors/warnings.""" print(*args, file=sys.stderr) diff --git a/tests/functional/filter/cram/filter-empty-output-reporting.t b/tests/functional/filter/cram/filter-empty-output-reporting.t index 5f90a8f62..61a7270f0 100644 --- a/tests/functional/filter/cram/filter-empty-output-reporting.t +++ b/tests/functional/filter/cram/filter-empty-output-reporting.t @@ -10,26 +10,30 @@ Test the default behavior for empty results is an error. > --metadata "$TESTDIR/../data/metadata.tsv" \ > --exclude-all \ > --output-strains filtered_strains.txt > /dev/null + 12 strains were dropped during filtering + 12 were dropped by `--exclude-all` ERROR: All samples have been dropped! Check filter rules and metadata file format. [2] $ wc -l filtered_strains.txt \s*0 .* (re) Repeat with the --empty-output-reporting=warn option. -This whould output a warning message but no error. +This should output a warning message but no error. $ ${AUGUR} filter \ > --metadata "$TESTDIR/../data/metadata.tsv" \ > --exclude-all \ > --output-strains filtered_strains.txt \ > --empty-output-reporting warn > /dev/null + 12 strains were dropped during filtering + 12 were dropped by `--exclude-all` WARNING: All samples have been dropped! Check filter rules and metadata file format. + 0 strains passed all filters $ wc -l filtered_strains.txt \s*0 .* (re) Ignore empty results with the --empty-output-reporting=silent option. Make sure all 3 output types are empty, except the metadata output should still include the header. -This should not output any messages to stderr. $ ${AUGUR} filter \ > --metadata "$TESTDIR/../data/metadata.tsv" \ @@ -38,7 +42,7 @@ This should not output any messages to stderr. > --output-sequences filtered_seqs.fasta \ > --output-metadata filtered_metadata.tsv \ > --output-strains filtered_strains.txt \ - > --empty-output-reporting silent > /dev/null + > --empty-output-reporting silent 2>/dev/null $ wc -l filtered_seqs.fasta \s*0 .* (re) $ diff <(head -n 1 filtered_metadata.tsv) <(head -n 1 "$TESTDIR/../data/metadata.tsv") diff --git a/tests/functional/filter/cram/filter-exclude-include.t b/tests/functional/filter/cram/filter-exclude-include.t index 1fa508bce..2fd9acb7b 100644 --- a/tests/functional/filter/cram/filter-exclude-include.t +++ b/tests/functional/filter/cram/filter-exclude-include.t @@ -10,6 +10,6 @@ Force include one South American record by country to get two total records. > --metadata "$TESTDIR/../data/metadata.tsv" \ > --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \ > --include-where "country=Ecuador" \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ wc -l filtered_strains.txt \s*2 .* (re) diff --git a/tests/functional/filter/cram/filter-exclude-where-multiple.t b/tests/functional/filter/cram/filter-exclude-where-multiple.t index a9676d914..d96a03ad2 100644 --- a/tests/functional/filter/cram/filter-exclude-where-multiple.t +++ b/tests/functional/filter/cram/filter-exclude-where-multiple.t @@ -16,7 +16,7 @@ Scenario 1: Run command with one --exclude-where flag and multiple values $ ${AUGUR} filter \ > --metadata metadata.tsv \ > --exclude-where "region=A" "region=B" \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null Both exclusions are applied. @@ -30,7 +30,7 @@ Scenario 2: Run command with two --exclude-where flags > --metadata metadata.tsv \ > --exclude-where "region=A" \ > --exclude-where "region=B" \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null Both exclusions are applied. diff --git a/tests/functional/filter/cram/filter-force-include-no-duplicates.t b/tests/functional/filter/cram/filter-force-include-no-duplicates.t index 5e5f4dacc..2dbca85f3 100644 --- a/tests/functional/filter/cram/filter-force-include-no-duplicates.t +++ b/tests/functional/filter/cram/filter-force-include-no-duplicates.t @@ -37,7 +37,7 @@ Test all outputs with --include-where. > --output-metadata metadata-filtered.tsv \ > --output-strains strains-filtered.txt \ > --output-sequences sequences-filtered.fasta \ - > > /dev/null 2>&1 + > 2>/dev/null $ cat metadata-filtered.tsv | tail -n+2 | sort -k1 a\t1 (esc) b\t2 (esc) @@ -72,7 +72,7 @@ Test all outputs with --include. > --output-metadata metadata-filtered.tsv \ > --output-strains strains-filtered.txt \ > --output-sequences sequences-filtered.fasta \ - > > /dev/null 2>&1 + > 2>/dev/null $ cat metadata-filtered.tsv | tail -n+2 | sort -k1 a\t1 (esc) b\t2 (esc) diff --git a/tests/functional/filter/cram/filter-max-date.t b/tests/functional/filter/cram/filter-max-date.t index 960ebc31b..5bea06dc0 100644 --- a/tests/functional/filter/cram/filter-max-date.t +++ b/tests/functional/filter/cram/filter-max-date.t @@ -16,7 +16,7 @@ Test that --max-date is inclusive. $ ${AUGUR} filter \ > --metadata metadata.tsv \ > --max-date 2020-03-01 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_1 SEQ_2 diff --git a/tests/functional/filter/cram/filter-metadata-date-formats.t b/tests/functional/filter/cram/filter-metadata-date-formats.t index 7203fbae6..b267be762 100644 --- a/tests/functional/filter/cram/filter-metadata-date-formats.t +++ b/tests/functional/filter/cram/filter-metadata-date-formats.t @@ -16,7 +16,7 @@ Test that 2020 is evaluated as 2020-XX-XX. $ ${AUGUR} filter \ > --metadata metadata.tsv \ > --min-date 2020-02-01 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_2 SEQ_3 @@ -26,7 +26,7 @@ Test that 2020.0, 2020, and 2020-XX-XX all pass --min-date 2019 $ ${AUGUR} filter \ > --metadata metadata.tsv \ > --min-date 2019 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_1 SEQ_2 diff --git a/tests/functional/filter/cram/filter-metadata-delimiter.t b/tests/functional/filter/cram/filter-metadata-delimiter.t index 971a6c5e6..324dc9d9f 100644 --- a/tests/functional/filter/cram/filter-metadata-delimiter.t +++ b/tests/functional/filter/cram/filter-metadata-delimiter.t @@ -13,7 +13,7 @@ Comma-delimited metadata is allowed by default. However, the output metadata wil $ ${AUGUR} filter \ > --metadata metadata.txt \ > --exclude-where column=A \ - > --output-metadata filtered.txt > /dev/null + > --output-metadata filtered.txt 2>/dev/null $ cat filtered.txt strain\tcolumn (esc) SEQ_2\tB (esc) @@ -62,7 +62,7 @@ Allow colon-delimited metadata. However, the output metadata will be tab-delimit > --metadata metadata.txt \ > --metadata-delimiters ':' \ > --exclude-where column=A \ - > --output-metadata filtered.txt > /dev/null + > --output-metadata filtered.txt 2>/dev/null $ cat filtered.txt strain\tcolumn (esc) SEQ_2\tB (esc) diff --git a/tests/functional/filter/cram/filter-min-date.t b/tests/functional/filter/cram/filter-min-date.t index ea13bb0fc..7548fb7ec 100644 --- a/tests/functional/filter/cram/filter-min-date.t +++ b/tests/functional/filter/cram/filter-min-date.t @@ -16,7 +16,7 @@ Test that --min-date is inclusive. $ ${AUGUR} filter \ > --metadata metadata.tsv \ > --min-date 2020-02-26 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_1 SEQ_2 diff --git a/tests/functional/filter/cram/filter-mismatched-sequences-error.t b/tests/functional/filter/cram/filter-mismatched-sequences-error.t index 1c92681ae..5930234fa 100644 --- a/tests/functional/filter/cram/filter-mismatched-sequences-error.t +++ b/tests/functional/filter/cram/filter-mismatched-sequences-error.t @@ -13,6 +13,9 @@ This should produce no results because the intersection of metadata and sequence > --max-date 2020-01-30 \ > --output-strains filtered_strains.txt > /dev/null Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + 13 strains were dropped during filtering + 1 had no metadata + 12 had no sequence data ERROR: All samples have been dropped! Check filter rules and metadata file format. [2] $ wc -l filtered_strains.txt @@ -27,6 +30,9 @@ Repeat with sequence and strain outputs. We should get the same results. > --output-strains filtered_strains.txt \ > --output-sequences filtered.fasta > /dev/null Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + 13 strains were dropped during filtering + 1 had no metadata + 12 had no sequence data ERROR: All samples have been dropped! Check filter rules and metadata file format. [2] $ wc -l filtered_strains.txt @@ -42,6 +48,9 @@ Since we expect metadata to be filtered by presence of strains in input sequence > --metadata "$TESTDIR/../data/metadata.tsv" \ > --output-strains filtered_strains.txt > /dev/null Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + 13 strains were dropped during filtering + 1 had no metadata + 12 had no sequence data ERROR: All samples have been dropped! Check filter rules and metadata file format. [2] $ wc -l filtered_strains.txt diff --git a/tests/functional/filter/cram/filter-numerical-ids.t b/tests/functional/filter/cram/filter-numerical-ids.t index f6167a962..66998d12f 100644 --- a/tests/functional/filter/cram/filter-numerical-ids.t +++ b/tests/functional/filter/cram/filter-numerical-ids.t @@ -25,7 +25,7 @@ Test that nothing is filtered out due to missing sequence data. > --metadata metadata.tsv \ > --sequences sequences.fasta \ > --output-strains filtered_strains.txt \ - > > /dev/null 2>&1 + > 2>/dev/null $ sort filtered_strains.txt 1 2 diff --git a/tests/functional/filter/cram/filter-output-contents.t b/tests/functional/filter/cram/filter-output-contents.t index 4f16002a4..a66ae4b09 100644 --- a/tests/functional/filter/cram/filter-output-contents.t +++ b/tests/functional/filter/cram/filter-output-contents.t @@ -15,7 +15,7 @@ The purpose of this test file is to check format and consistency among the > --no-probabilistic-sampling \ > --output-metadata filtered_metadata.tsv \ > --output-strains filtered_strains.txt \ - > --output filtered.fasta > /dev/null + > --output filtered.fasta 2>/dev/null Check that the header row is identical between input and output metadata. diff --git a/tests/functional/filter/cram/filter-output-metadata-header.t b/tests/functional/filter/cram/filter-output-metadata-header.t index 862977e33..15ff155e1 100644 --- a/tests/functional/filter/cram/filter-output-metadata-header.t +++ b/tests/functional/filter/cram/filter-output-metadata-header.t @@ -14,7 +14,7 @@ Quoted columns containing the tab delimiter are left unchanged. $ ${AUGUR} filter \ > --metadata metadata.tsv \ - > --output-metadata filtered_metadata.tsv > /dev/null + > --output-metadata filtered_metadata.tsv 2>/dev/null $ head -n 1 filtered_metadata.tsv strain "col 1" @@ -28,7 +28,7 @@ Quoted columns without the tab delimiter are stripped of the quotes. $ ${AUGUR} filter \ > --metadata metadata.tsv \ - > --output-metadata filtered_metadata.tsv > /dev/null + > --output-metadata filtered_metadata.tsv 2>/dev/null $ head -n 1 filtered_metadata.tsv strain col1 @@ -42,7 +42,7 @@ Any other columns with quotes are quoted, and pre-existing quotes are escsaped b $ ${AUGUR} filter \ > --metadata metadata.tsv \ - > --output-metadata filtered_metadata.tsv > /dev/null + > --output-metadata filtered_metadata.tsv 2>/dev/null $ head -n 1 filtered_metadata.tsv strain "col""1" "col2""" diff --git a/tests/functional/filter/cram/filter-query-and-include-where.t b/tests/functional/filter/cram/filter-query-and-include-where.t index 101b918aa..b61891978 100644 --- a/tests/functional/filter/cram/filter-query-and-include-where.t +++ b/tests/functional/filter/cram/filter-query-and-include-where.t @@ -17,7 +17,7 @@ Test that --include_where still works with filtering on query. > --metadata metadata.tsv \ > --query "quality=='good' & location=='colorado'" \ > --include-where "location=nevada" \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_1 SEQ_3 diff --git a/tests/functional/filter/cram/filter-query-and-include.t b/tests/functional/filter/cram/filter-query-and-include.t index aeb603c25..a691f365b 100644 --- a/tests/functional/filter/cram/filter-query-and-include.t +++ b/tests/functional/filter/cram/filter-query-and-include.t @@ -20,7 +20,7 @@ Test that --include_where still works with filtering on query. > --metadata metadata.tsv \ > --query "quality=='good' & location=='colorado'" \ > --include include.txt \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_1 SEQ_3 diff --git a/tests/functional/filter/cram/filter-query-backtick-quoting.t b/tests/functional/filter/cram/filter-query-backtick-quoting.t index 84c9e48a4..fd6157a39 100644 --- a/tests/functional/filter/cram/filter-query-backtick-quoting.t +++ b/tests/functional/filter/cram/filter-query-backtick-quoting.t @@ -17,7 +17,7 @@ The 'region name' column is query-able by backtick quoting. $ ${AUGUR} filter \ > --metadata metadata.tsv \ > --query '(`region name` == "A")' \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_1 diff --git a/tests/functional/filter/cram/filter-query-example.t b/tests/functional/filter/cram/filter-query-example.t index 89f6e2a6e..e36e6e5e0 100644 --- a/tests/functional/filter/cram/filter-query-example.t +++ b/tests/functional/filter/cram/filter-query-example.t @@ -8,7 +8,7 @@ First, select strains from Brazil (there should be 1). $ ${AUGUR} filter \ > --metadata "$TESTDIR/../data/metadata.tsv" \ > --query "country == 'Brazil'" \ - > --output-strains filtered_strains.brazil.txt > /dev/null + > --output-strains filtered_strains.brazil.txt 2>/dev/null $ wc -l filtered_strains.brazil.txt \s*1 .* (re) @@ -17,7 +17,7 @@ Then, select strains from Colombia (there should be 3). $ ${AUGUR} filter \ > --metadata "$TESTDIR/../data/metadata.tsv" \ > --query "country == 'Colombia'" \ - > --output-strains filtered_strains.colombia.txt > /dev/null + > --output-strains filtered_strains.colombia.txt 2>/dev/null $ wc -l filtered_strains.colombia.txt \s*3 .* (re) @@ -29,7 +29,7 @@ Finally, exclude all sequences except those from the two sets of strains (there > --metadata "$TESTDIR/../data/metadata.tsv" \ > --exclude-all \ > --include filtered_strains.brazil.txt filtered_strains.colombia.txt \ - > --output filtered.fasta > /dev/null + > --output filtered.fasta 2>/dev/null $ grep "^>" filtered.fasta | wc -l \s*4 (re) @@ -42,7 +42,7 @@ We should get the same outputs without building a sequence index on the fly, bec > --exclude-all \ > --include filtered_strains.brazil.txt filtered_strains.colombia.txt \ > --output filtered.fasta \ - > --output-metadata filtered.tsv > /dev/null + > --output-metadata filtered.tsv 2>/dev/null $ grep "^>" filtered.fasta | wc -l \s*4 (re) @@ -58,6 +58,6 @@ Alternately, exclude the sequences from Brazil and Colombia (N=4) and records wi > --sequence-index "$TESTDIR/../data/sequence_index.tsv" \ > --metadata "$TESTDIR/../data/metadata.tsv" \ > --exclude filtered_strains.brazil.txt filtered_strains.colombia.txt \ - > --output filtered.fasta > /dev/null + > --output filtered.fasta 2>/dev/null $ grep "^>" filtered.fasta | wc -l \s*7 (re) diff --git a/tests/functional/filter/cram/filter-query-numerical.t b/tests/functional/filter/cram/filter-query-numerical.t index 5aeb142f8..875c04802 100644 --- a/tests/functional/filter/cram/filter-query-numerical.t +++ b/tests/functional/filter/cram/filter-query-numerical.t @@ -17,7 +17,7 @@ The 'coverage' column should be query-able by numerical comparisons. $ ${AUGUR} filter \ > --metadata metadata.tsv \ > --query "coverage >= 0.95" \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_2 @@ -52,7 +52,7 @@ However, that is still possible by explicitly specifying that it is a string col > --metadata metadata.tsv \ > --query "coverage.str.endswith('.95')" \ > --query-columns coverage:str \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_2 diff --git a/tests/functional/filter/cram/filter-query-str.t b/tests/functional/filter/cram/filter-query-str.t index a6b11b231..6ada43eb9 100644 --- a/tests/functional/filter/cram/filter-query-str.t +++ b/tests/functional/filter/cram/filter-query-str.t @@ -16,7 +16,7 @@ Create metadata file for testing. $ ${AUGUR} filter \ > --metadata metadata.tsv \ > --query "column.str.startswith('value')" \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_1 diff --git a/tests/functional/filter/cram/filter-sequences-vcf.t b/tests/functional/filter/cram/filter-sequences-vcf.t index bbc433196..f6ddaa6cb 100644 --- a/tests/functional/filter/cram/filter-sequences-vcf.t +++ b/tests/functional/filter/cram/filter-sequences-vcf.t @@ -11,6 +11,10 @@ Filter TB strains from VCF and save as a list of filtered strains. > --output filtered.vcf \ > --output-strains filtered_strains.txt > /dev/null Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + 162 strains were dropped during filtering + 155 had no sequence data + 7 were dropped because they were earlier than 2012.0 or missing a date + 3 strains passed all filters $ wc -l filtered_strains.txt \s*3 .* (re) diff --git a/tests/functional/filter/cram/filter-subsample-missing-date-parts.t b/tests/functional/filter/cram/filter-subsample-missing-date-parts.t index 810fac97a..c70e62213 100644 --- a/tests/functional/filter/cram/filter-subsample-missing-date-parts.t +++ b/tests/functional/filter/cram/filter-subsample-missing-date-parts.t @@ -20,6 +20,10 @@ track which records were skipped for which reasons. > --subsample-seed 0 \ > --output-log log.txt \ > --output-strains filtered_strains.txt > /dev/null + 2 strains were dropped during filtering + 1 was dropped during grouping due to ambiguous year information + 1 was dropped because of subsampling criteria + 1 strain passed all filters $ cat log.txt strain\tfilter\tkwargs (esc) SEQ_1\tskip_group_by_with_ambiguous_year\t"[[""date_column"", ""date""]]" (esc) @@ -37,6 +41,11 @@ month information in their date fields. > --subsample-seed 0 \ > --output-log log.txt \ > --output-strains filtered_strains.txt > /dev/null + 2 strains were dropped during filtering + 1 was dropped during grouping due to ambiguous year information + 1 was dropped during grouping due to ambiguous month information + 0 were dropped because of subsampling criteria + 1 strain passed all filters $ cat log.txt strain\tfilter\tkwargs (esc) SEQ_1\tskip_group_by_with_ambiguous_year\t"[[""date_column"", ""date""]]" (esc) diff --git a/tests/functional/filter/cram/subsample-5-sequences-without-group-by-no-probabilistic-sampling.t b/tests/functional/filter/cram/subsample-5-sequences-without-group-by-no-probabilistic-sampling.t index 5fc6c955f..ef3cfc0d1 100644 --- a/tests/functional/filter/cram/subsample-5-sequences-without-group-by-no-probabilistic-sampling.t +++ b/tests/functional/filter/cram/subsample-5-sequences-without-group-by-no-probabilistic-sampling.t @@ -13,6 +13,6 @@ This generates a dummy category and subsamples from there. With no-probabilistic > --subsample-max-sequences 5 \ > --subsample-seed 314159 \ > --no-probabilistic-sampling \ - > --output filtered.fasta > /dev/null + > --output filtered.fasta 2>/dev/null $ grep ">" filtered.fasta | wc -l \s*5 (re) diff --git a/tests/functional/filter/cram/subsample-8-sequences-no-probabilistic-sampling.t b/tests/functional/filter/cram/subsample-8-sequences-no-probabilistic-sampling.t index cd648093c..4c9655f2a 100644 --- a/tests/functional/filter/cram/subsample-8-sequences-no-probabilistic-sampling.t +++ b/tests/functional/filter/cram/subsample-8-sequences-no-probabilistic-sampling.t @@ -14,6 +14,6 @@ With 8 groups to subsample from (after filtering), this should produce one seque > --subsample-max-sequences 8 \ > --subsample-seed 314159 \ > --no-probabilistic-sampling \ - > --output filtered.fasta > /dev/null + > --output filtered.fasta 2>/dev/null $ grep ">" filtered.fasta | wc -l \s*8 (re) diff --git a/tests/functional/filter/cram/subsample-ambiguous-dates-error.t b/tests/functional/filter/cram/subsample-ambiguous-dates-error.t index ec4194a2a..10969be46 100644 --- a/tests/functional/filter/cram/subsample-ambiguous-dates-error.t +++ b/tests/functional/filter/cram/subsample-ambiguous-dates-error.t @@ -19,10 +19,10 @@ Metadata with ambiguous days on all strains should error when grouping by week. > --subsample-seed 0 \ > --output-metadata metadata-filtered.tsv \ > --output-log filtered_log.tsv - ERROR: All samples have been dropped! Check filter rules and metadata file format. 4 strains were dropped during filtering 4 were dropped during grouping due to ambiguous day information 0 were dropped because of subsampling criteria + ERROR: All samples have been dropped! Check filter rules and metadata file format. [2] $ cat filtered_log.tsv | grep "skip_group_by_with_ambiguous_day" | wc -l \s*4 (re) @@ -46,10 +46,10 @@ Metadata with ambiguous months on all strains should error when grouping by mont > --subsample-seed 0 \ > --output-metadata metadata-filtered.tsv \ > --output-log filtered_log.tsv - ERROR: All samples have been dropped! Check filter rules and metadata file format. 4 strains were dropped during filtering 4 were dropped during grouping due to ambiguous month information 0 were dropped because of subsampling criteria + ERROR: All samples have been dropped! Check filter rules and metadata file format. [2] $ cat filtered_log.tsv | grep "skip_group_by_with_ambiguous_month" | wc -l \s*4 (re) @@ -73,10 +73,10 @@ Metadata with ambiguous years on all strains should error when grouping by year. > --subsample-seed 0 \ > --output-metadata metadata-filtered.tsv \ > --output-log filtered_log.tsv - ERROR: All samples have been dropped! Check filter rules and metadata file format. 4 strains were dropped during filtering 4 were dropped during grouping due to ambiguous year information 0 were dropped because of subsampling criteria + ERROR: All samples have been dropped! Check filter rules and metadata file format. [2] $ cat filtered_log.tsv | grep "skip_group_by_with_ambiguous_year" | wc -l \s*4 (re) diff --git a/tests/functional/filter/cram/subsample-group-by-empty-value.t b/tests/functional/filter/cram/subsample-group-by-empty-value.t index 6a16e9f91..1fdbd4bd8 100644 --- a/tests/functional/filter/cram/subsample-group-by-empty-value.t +++ b/tests/functional/filter/cram/subsample-group-by-empty-value.t @@ -22,7 +22,7 @@ I.e. the groups here are: > --sequences-per-group 1 \ > --subsample-seed 0 \ > --output-log filtered-log.tsv \ - > --output-strains filtered-strains.txt > /dev/null + > --output-strains filtered-strains.txt 2>/dev/null $ cat filtered-strains.txt SEQ1 SEQ3 diff --git a/tests/functional/filter/cram/subsample-group-by-region-1-sequence-per-group-seed.t b/tests/functional/filter/cram/subsample-group-by-region-1-sequence-per-group-seed.t index cf56999ce..cecc84006 100644 --- a/tests/functional/filter/cram/subsample-group-by-region-1-sequence-per-group-seed.t +++ b/tests/functional/filter/cram/subsample-group-by-region-1-sequence-per-group-seed.t @@ -9,7 +9,7 @@ Filter with subsampling, requesting 1 sequence per group (for a group with 4 dis > --group-by region \ > --sequences-per-group 1 \ > --subsample-seed 314159 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ wc -l filtered_strains.txt \s*4 .* (re) @@ -20,6 +20,6 @@ By setting the subsample seed above, we should guarantee that we get the same "r > --group-by region \ > --sequences-per-group 1 \ > --subsample-seed 314159 \ - > --output-strains filtered_strains_repeated.txt > /dev/null + > --output-strains filtered_strains_repeated.txt 2>/dev/null $ diff -u <(sort filtered_strains.txt) <(sort filtered_strains_repeated.txt) diff --git a/tests/functional/filter/cram/subsample-group-by-with-custom-year-column.t b/tests/functional/filter/cram/subsample-group-by-with-custom-year-column.t index 0f3dfe1e5..7a33eaeb5 100644 --- a/tests/functional/filter/cram/subsample-group-by-with-custom-year-column.t +++ b/tests/functional/filter/cram/subsample-group-by-with-custom-year-column.t @@ -22,6 +22,9 @@ Group by generated year column, and ensure all original columns are still in the > --subsample-seed 0 \ > --output-metadata filtered_metadata.tsv > /dev/null WARNING: `--group-by year` uses a generated year value from the 'date' column. The custom 'year' column in the metadata is ignored for grouping purposes. + 3 strains were dropped during filtering + 3 were dropped because of subsampling criteria + 2 strains passed all filters $ cat filtered_metadata.tsv strain\tdate\tyear\tmonth (esc) SEQ1\t2021-01-01\todd\tJanuary (esc) @@ -37,6 +40,9 @@ Group by generated year and month columns, and ensure all original columns are s > --output-metadata filtered_metadata.tsv > /dev/null WARNING: `--group-by month` uses a generated month value from the 'date' column. The custom 'month' column in the metadata is ignored for grouping purposes. WARNING: `--group-by year` uses a generated year value from the 'date' column. The custom 'year' column in the metadata is ignored for grouping purposes. + 2 strains were dropped during filtering + 2 were dropped because of subsampling criteria + 3 strains passed all filters $ cat filtered_metadata.tsv strain\tdate\tyear\tmonth (esc) SEQ1\t2021-01-01\todd\tJanuary (esc) diff --git a/tests/functional/filter/cram/subsample-group-by-without-force-included-strains.t b/tests/functional/filter/cram/subsample-group-by-without-force-included-strains.t index 475d6635e..9c92eecc1 100644 --- a/tests/functional/filter/cram/subsample-group-by-without-force-included-strains.t +++ b/tests/functional/filter/cram/subsample-group-by-without-force-included-strains.t @@ -16,4 +16,4 @@ We don't filter these strains, so they could be considered for subsampling, but > --include include_old_strains.txt \ > --group-by month year \ > --subsample-max-sequences 10 \ - > --output-metadata metadata-filtered.tsv > /dev/null + > --output-metadata metadata-filtered.tsv 2>/dev/null diff --git a/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t b/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t index cb679e676..72ed589f2 100644 --- a/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t +++ b/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t @@ -15,6 +15,14 @@ Explicitly use probabilistic subsampling to handle the case when there are more > --probabilistic-sampling \ > --output-strains filtered_strains_probabilistic.txt > /dev/null WARNING: Asked to provide at most 5 sequences, but there are 8 groups. + Sampling probabilistically at 0.6055 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. + 10 strains were dropped during filtering + 1 had no metadata + 1 had no sequence data + 1 was dropped because it was earlier than 2012.0 or missing a date + 1 was dropped during grouping due to ambiguous month information + 6 were dropped because of subsampling criteria, using seed 314159 + 3 strains passed all filters Using the default probabilistic subsampling, should work the same as the previous case. @@ -28,6 +36,14 @@ Using the default probabilistic subsampling, should work the same as the previou > --subsample-seed 314159 \ > --output-strains filtered_strains_default.txt > /dev/null WARNING: Asked to provide at most 5 sequences, but there are 8 groups. + Sampling probabilistically at 0.6055 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. + 10 strains were dropped during filtering + 1 had no metadata + 1 had no sequence data + 1 was dropped because it was earlier than 2012.0 or missing a date + 1 was dropped during grouping due to ambiguous month information + 6 were dropped because of subsampling criteria, using seed 314159 + 3 strains passed all filters By setting the subsample seed above, we should get the same results for both runs. diff --git a/tests/functional/filter/cram/subsample-priority-file.t b/tests/functional/filter/cram/subsample-priority-file.t index 1801c8d0c..cb5c5058d 100644 --- a/tests/functional/filter/cram/subsample-priority-file.t +++ b/tests/functional/filter/cram/subsample-priority-file.t @@ -11,6 +11,6 @@ The two highest priority strains are in these two years. > --group-by year \ > --priority "$TESTDIR/../data/priorities.tsv" \ > --sequences-per-group 1 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ diff -u <(sort -k 2,2rn -k 1,1 "$TESTDIR/../data/priorities.tsv" | head -n 2 | cut -f 1) <(sort -k 1,1 filtered_strains.txt) diff --git a/tests/functional/filter/cram/subsample-priority-values.t b/tests/functional/filter/cram/subsample-priority-values.t index 00fb0cd94..4fc9e0781 100644 --- a/tests/functional/filter/cram/subsample-priority-values.t +++ b/tests/functional/filter/cram/subsample-priority-values.t @@ -27,7 +27,7 @@ not have a priority score. > --metadata metadata.tsv \ > --priority priorities.tsv \ > --subsample-max-sequences 5 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_1 SEQ_2 @@ -43,7 +43,7 @@ values). > --metadata metadata.tsv \ > --priority priorities.tsv \ > --subsample-max-sequences 4 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_1 SEQ_2 @@ -56,7 +56,7 @@ Subsample 1 less strain. SEQ_4 should now be dropped. > --metadata metadata.tsv \ > --priority priorities.tsv \ > --subsample-max-sequences 3 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_1 SEQ_2 @@ -68,7 +68,7 @@ Subsample 1 less strain. SEQ_1 should now be dropped. > --metadata metadata.tsv \ > --priority priorities.tsv \ > --subsample-max-sequences 2 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_2 SEQ_3 @@ -79,6 +79,6 @@ Subsample 1 less strain. SEQ_1 should now be dropped. > --metadata metadata.tsv \ > --priority priorities.tsv \ > --subsample-max-sequences 1 \ - > --output-strains filtered_strains.txt > /dev/null + > --output-strains filtered_strains.txt 2>/dev/null $ sort filtered_strains.txt SEQ_3 diff --git a/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t b/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t index fa183d0ca..b638bdd59 100644 --- a/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t +++ b/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t @@ -9,9 +9,16 @@ Strains with ambiguous years or months should be dropped and logged. > --metadata "$TESTDIR/../data/metadata.tsv" \ > --group-by year month \ > --subsample-max-sequences 5 \ + > --subsample-seed 0 \ > --output-strains filtered_strains.txt \ > --output-log filtered_log.tsv > /dev/null WARNING: Asked to provide at most 5 sequences, but there are 6 groups. + Sampling probabilistically at 0.8203 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. + 8 strains were dropped during filtering + 1 was dropped during grouping due to ambiguous year information + 1 was dropped during grouping due to ambiguous month information + 6 were dropped because of subsampling criteria + 4 strains passed all filters $ grep "SG_018" filtered_log.tsv | cut -f 1-2 SG_018\tskip_group_by_with_ambiguous_month (esc) $ grep "COL/FLR_00024/2015" filtered_log.tsv | cut -f 1-2 @@ -36,7 +43,7 @@ Group by 'week'. Check the number of strains that have been dropped due to ambig > --sequences-per-group 1 \ > --subsample-seed 0 \ > --output-strains filtered_strains.txt \ - > --output-log filtered_log.tsv > /dev/null + > --output-log filtered_log.tsv 2>/dev/null $ grep "skip_group_by_with_ambiguous_year" filtered_log.tsv | wc -l \s*1 (re) $ grep "skip_group_by_with_ambiguous_month" filtered_log.tsv | wc -l