diff --git a/augur/filter.py b/augur/filter.py index d98559b4b..7ef1d789e 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -100,7 +100,9 @@ def register_arguments(parser): subsample_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category") subsample_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences") parser.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist") - parser.add_argument('--probabilistic-sampling', action='store_true', help="Sample probabilitically from groups -- useful when there are more groups than requested sequences") + probabilistic_sampling_group = parser.add_mutually_exclusive_group() + probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.") + probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling') parser.add_argument('--subsample-seed', help="random number generator seed to allow reproducible sub-sampling (with same input data). Can be number or string.") parser.add_argument('--exclude-where', nargs='+', help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND") @@ -111,6 +113,7 @@ def register_arguments(parser): parser.add_argument('--query', help="Filter samples by attribute. Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.") parser.add_argument('--output', '-o', help="output file", required=True) + parser.set_defaults(probabilistic_sampling=True) def run(args): ''' diff --git a/tests/builds/tb/Snakefile b/tests/builds/tb/Snakefile index 8f342c2af..7654d0e99 100644 --- a/tests/builds/tb/Snakefile +++ b/tests/builds/tb/Snakefile @@ -49,6 +49,7 @@ rule filter: --exclude {input.exclude} \ --group-by {params.group_by} \ --sequences-per-group {params.sequences_per_group} \ + --no-probabilistic-sampling """ rule mask: diff --git a/tests/builds/various_export_settings/base.snakefile b/tests/builds/various_export_settings/base.snakefile index 6eedfe6a1..ec0e3bc61 100644 --- a/tests/builds/various_export_settings/base.snakefile +++ b/tests/builds/various_export_settings/base.snakefile @@ -50,6 +50,7 @@ rule filter: --output {output.sequences} \ --group-by {params.group_by} \ --sequences-per-group {params.sequences_per_group} \ + --no-probabilistic-sampling \ --min-date {params.min_date} """ @@ -112,4 +113,3 @@ rule refine: --date-inference {params.date_inference} \ --clock-filter-iqd {params.clock_filter_iqd} """ - diff --git a/tests/builds/zika.t b/tests/builds/zika.t index 900e92c2b..73f2199b4 100644 --- a/tests/builds/zika.t +++ b/tests/builds/zika.t @@ -30,6 +30,7 @@ Filter sequences by a minimum date and an exclusion list and only keep one seque > --group-by country year month \ > --sequences-per-group 1 \ > --subsample-seed 314159 \ + > --no-probabilistic-sampling \ > --min-date 2012 > /dev/null $ diff -u "results/filtered.fasta" "$TMP/out/filtered.fasta" @@ -178,4 +179,4 @@ Export JSON files as v2 auspice outputs. Switch back to the original directory where testing started. - $ popd > /dev/null \ No newline at end of file + $ popd > /dev/null diff --git a/tests/builds/zika/Snakefile b/tests/builds/zika/Snakefile deleted file mode 100644 index d3d2a64e2..000000000 --- a/tests/builds/zika/Snakefile +++ /dev/null @@ -1,279 +0,0 @@ -rule all: - input: - auspice_tip_frequencies = "auspice/zika_tip-frequencies.json", - auspice_v1_tree = "auspice/v1_zika_tree.json", - auspice_v1_meta = "auspice/v1_zika_meta.json", - auspice_v1_seq = "auspice/v1_zika_seq.json", - auspice_v1_tip_frequencies = "auspice/v1_zika_tip-frequencies.json", - auspice_v2 = "auspice/v2_zika.json", - auspice_v2_tip_frequencies = "auspice/v2_zika_tip-frequencies.json" -rule files: - params: - input_fasta = "data/zika.fasta", - dropped_strains = "config/dropped_strains.txt", - reference = "config/zika_outgroup.gb", - colors = "config/colors.tsv", - auspice_config_v1 = "config/auspice_config_v1.json", - auspice_config_v2 = "config/auspice_config_v2.json" - -files = rules.files.params - -rule parse: - input: - sequences = files.input_fasta - output: - sequences = "results/sequences.fasta", - metadata = "results/metadata.tsv" - params: - fasta_fields = "strain virus accession date region country division city db segment authors url title journal paper_url", - prettify_fields = "region country division city" - shell: - """ - augur parse \ - --sequences {input.sequences} \ - --output-sequences {output.sequences} \ - --output-metadata {output.metadata} \ - --fields {params.fasta_fields} \ - --prettify-fields {params.prettify_fields} - """ - -rule filter: - message: "Filtering sequences" - input: - sequences = rules.parse.output.sequences, - metadata = rules.parse.output.metadata, - exclude = files.dropped_strains - output: - sequences = "results/filtered.fasta" - params: - group_by = "country year month", - sequences_per_group = 1, - min_date = 2012 - shell: - """ - augur filter \ - --sequences {input.sequences} \ - --metadata {input.metadata} \ - --exclude {input.exclude} \ - --output {output.sequences} \ - --group-by {params.group_by} \ - --sequences-per-group {params.sequences_per_group} \ - --subsample-seed 314159 \ - --min-date {params.min_date} - """ - -rule align: - message: "Aligning sequences" - input: - sequences = rules.filter.output.sequences, - reference = files.reference - output: - alignment = "results/aligned.fasta" - shell: - """ - augur align \ - --sequences {input.sequences} \ - --reference-sequence {input.reference} \ - --output {output.alignment} \ - --fill-gaps - """ - -rule tree: - message: "Building tree" - input: - alignment = rules.align.output.alignment - output: - tree = "results/tree_raw.nwk" - params: - method = "iqtree" - shell: - """ - augur tree \ - --alignment {input.alignment} \ - --output {output.tree} \ - --method {params.method} \ - --tree-builder-args "-seed 314159" - """ - -rule refine: - message: "Refining tree" - input: - tree = rules.tree.output.tree, - alignment = rules.align.output, - metadata = rules.parse.output.metadata - output: - tree = "results/tree.nwk", - node_data = "results/branch_lengths.json" - params: - coalescent = "opt", - date_inference = "marginal", - clock_filter_iqd = 4 - shell: - """ - augur refine \ - --tree {input.tree} \ - --alignment {input.alignment} \ - --metadata {input.metadata} \ - --output-tree {output.tree} \ - --output-node-data {output.node_data} \ - --timetree \ - --coalescent {params.coalescent} \ - --date-confidence \ - --date-inference {params.date_inference} \ - --clock-filter-iqd {params.clock_filter_iqd} \ - --seed 314159 - """ - -rule ancestral: - message: "Reconstructing ancestral sequences and mutations" - input: - tree = rules.refine.output.tree, - alignment = rules.align.output - output: - node_data = "results/nt_muts.json" - params: - inference = "joint" - shell: - """ - augur ancestral \ - --tree {input.tree} \ - --alignment {input.alignment} \ - --infer-ambiguous \ - --output-node-data {output.node_data} \ - --inference {params.inference} - """ - -rule translate: - message: "Translating amino acid sequences" - input: - tree = rules.refine.output.tree, - node_data = rules.ancestral.output.node_data, - reference = files.reference - output: - node_data = "results/aa_muts.json" - shell: - """ - augur translate \ - --tree {input.tree} \ - --ancestral-sequences {input.node_data} \ - --reference-sequence {input.reference} \ - --output-node-data {output.node_data} \ - """ - -rule traits: - message: "Inferring ancestral traits" - input: - tree = rules.refine.output.tree, - metadata = rules.parse.output.metadata - output: - node_data = "results/traits.json", - params: - columns = "country region", - sampling_bias_correction = 3, - weights = "config/trait_weights.csv" - shell: - """ - augur traits \ - --tree {input.tree} \ - --weights {params.weights} \ - --metadata {input.metadata} \ - --output-node-data {output.node_data} \ - --columns {params.columns} \ - --sampling-bias-correction {params.sampling_bias_correction} \ - --confidence - """ - -rule export_v1: - message: "Exporting data files for for auspice using nextflu compatible schema" - input: - tree = rules.refine.output.tree, - metadata = rules.parse.output.metadata, - branch_lengths = rules.refine.output.node_data, - traits = rules.traits.output.node_data, - nt_muts = rules.ancestral.output.node_data, - aa_muts = rules.translate.output.node_data, - colors = files.colors, - auspice_config = files.auspice_config_v1 - output: - auspice_tree = rules.all.input.auspice_v1_tree, - auspice_meta = rules.all.input.auspice_v1_meta, - auspice_seq = rules.all.input.auspice_v1_seq - shell: - """ - augur export v1 \ - --tree {input.tree} \ - --metadata {input.metadata} \ - --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \ - --colors {input.colors} \ - --auspice-config {input.auspice_config} \ - --output-tree {output.auspice_tree} \ - --output-meta {output.auspice_meta} \ - --output-sequence {output.auspice_seq} - augur validate export-v1 {output.auspice_meta} {output.auspice_tree} - """ - -rule export_v2: - message: "Exporting data files for for auspice using nextstrain schema v2" - input: - tree = rules.refine.output.tree, - metadata = rules.parse.output.metadata, - branch_lengths = rules.refine.output.node_data, - traits = rules.traits.output.node_data, - nt_muts = rules.ancestral.output.node_data, - aa_muts = rules.translate.output.node_data, - auspice_config = files.auspice_config_v2, - colors = files.colors - params: - title = "Real-time tracking of Zika virus evolution -- v2 JSON" - output: - auspice_main = rules.all.input.auspice_v2 - shell: - """ - augur export v2 \ - --tree {input.tree} \ - --metadata {input.metadata} \ - --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \ - --colors {input.colors} \ - --auspice-config {input.auspice_config} \ - --output {output.auspice_main} \ - --title {params.title:q} \ - --panels tree map entropy frequencies - """ - -rule tip_frequencies: - input: - tree = rules.refine.output.tree, - metadata = rules.parse.output.metadata - params: - pivot_interval = 3 - output: - tip_freq = rules.all.input.auspice_tip_frequencies - shell: - """ - augur frequencies \ - --method kde \ - --tree {input.tree} \ - --metadata {input.metadata} \ - --pivot-interval {params.pivot_interval} \ - --output {output} - """ - -rule copy_tip_frequencies: - input: - frequencies = rules.tip_frequencies.output.tip_freq - output: - v1 = rules.all.input.auspice_v1_tip_frequencies, - v2 = rules.all.input.auspice_v2_tip_frequencies - shell: - """ - cp -f {input} {output.v1}; - cp -f {input} {output.v2}; - """ - -rule clean: - message: "Removing directories: {params}" - params: - "results ", - "auspice" - shell: - "rm -rfv {params}" diff --git a/tests/functional/filter.t b/tests/functional/filter.t index 66a53971b..3cc4999f2 100644 --- a/tests/functional/filter.t +++ b/tests/functional/filter.t @@ -13,13 +13,14 @@ With 10 groups to subsample from, this should produce one sequence per group. > --group-by country year month \ > --subsample-max-sequences 10 \ > --subsample-seed 314159 \ + > --no-probabilistic-sampling \ > --output "$TMP/filtered.fasta" > /dev/null $ grep ">" "$TMP/filtered.fasta" | wc -l - 10 + \s*10 (re) $ rm -f "$TMP/filtered.fasta" Try to filter with subsampling when there are more available groups than requested sequences. -This should fail. +This should fail, as probabilistic sampling is explicitly disabled. $ ${AUGUR} filter \ > --sequences filter/sequences.fasta \ @@ -28,12 +29,13 @@ This should fail. > --group-by country year month \ > --subsample-max-sequences 5 \ > --subsample-seed 314159 \ + > --no-probabilistic-sampling \ > --output "$TMP/filtered.fasta" ERROR: Asked to provide at most 5 sequences, but there are 10 groups. [1] $ rm -f "$TMP/filtered.fasta" -Use probabilistic subsampling to handle the case when there are more available groups than requested sequences. +Explicitly use probabilistic subsampling to handle the case when there are more available groups than requested sequences. $ ${AUGUR} filter \ > --sequences filter/sequences.fasta \ @@ -45,3 +47,15 @@ Use probabilistic subsampling to handle the case when there are more available g > --probabilistic-sampling \ > --output "$TMP/filtered.fasta" > /dev/null $ rm -f "$TMP/filtered.fasta" + +Using the default probabilistic subsampling, should work the same as the previous case. + + $ ${AUGUR} filter \ + > --sequences filter/sequences.fasta \ + > --metadata filter/metadata.tsv \ + > --min-date 2012 \ + > --group-by country year month \ + > --subsample-max-sequences 5 \ + > --subsample-seed 314159 \ + > --output "$TMP/filtered.fasta" > /dev/null + $ rm -f "$TMP/filtered.fasta"