nextstrain · huddlej · Jan 20, 2021 · Jan 5, 2021
diff --git a/augur/filter.py b/augur/filter.py
@@ -100,7 +100,9 @@ def register_arguments(parser):
     subsample_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
     subsample_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
     parser.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
-    parser.add_argument('--probabilistic-sampling', action='store_true', help="Sample probabilitically from groups -- useful when there are more groups than requested sequences")
+    probabilistic_sampling_group = parser.add_mutually_exclusive_group()
+    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
+    probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
     parser.add_argument('--subsample-seed', help="random number generator seed to allow reproducible sub-sampling (with same input data). Can be number or string.")
     parser.add_argument('--exclude-where', nargs='+',
                                 help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
@@ -111,6 +113,7 @@ def register_arguments(parser):
     parser.add_argument('--query', help="Filter samples by attribute. Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.")
     parser.add_argument('--output', '-o', help="output file", required=True)
 
+    parser.set_defaults(probabilistic_sampling=True)
 
 def run(args):
     '''

diff --git a/tests/builds/tb/Snakefile b/tests/builds/tb/Snakefile
@@ -49,6 +49,7 @@ rule filter:
             --exclude {input.exclude} \
             --group-by {params.group_by} \
             --sequences-per-group {params.sequences_per_group} \
+            --no-probabilistic-sampling
         """
 
 rule mask:

diff --git a/tests/builds/various_export_settings/base.snakefile b/tests/builds/various_export_settings/base.snakefile
@@ -50,6 +50,7 @@ rule filter:
             --output {output.sequences} \
             --group-by {params.group_by} \
             --sequences-per-group {params.sequences_per_group} \
+            --no-probabilistic-sampling \
             --min-date {params.min_date}
         """
 
@@ -112,4 +113,3 @@ rule refine:
             --date-inference {params.date_inference} \
             --clock-filter-iqd {params.clock_filter_iqd}
         """
-
diff --git a/tests/builds/zika.t b/tests/builds/zika.t
@@ -30,6 +30,7 @@ Filter sequences by a minimum date and an exclusion list and only keep one seque
   >   --group-by country year month \
   >   --sequences-per-group 1 \
   >   --subsample-seed 314159 \
+  >   --no-probabilistic-sampling \
   >   --min-date 2012 > /dev/null
 
   $ diff -u "results/filtered.fasta" "$TMP/out/filtered.fasta"
@@ -178,4 +179,4 @@ Export JSON files as v2 auspice outputs.
 
 Switch back to the original directory where testing started.
 
-  $ popd > /dev/null
+  $ popd > /dev/null
diff --git a/tests/builds/zika/Snakefile b/tests/builds/zika/Snakefile
diff --git a/tests/functional/filter.t b/tests/functional/filter.t
@@ -13,13 +13,14 @@ With 10 groups to subsample from, this should produce one sequence per group.
   >  --group-by country year month \
   >  --subsample-max-sequences 10 \
   >  --subsample-seed 314159 \
+  >  --no-probabilistic-sampling \
   >  --output "$TMP/filtered.fasta" > /dev/null
   $ grep ">" "$TMP/filtered.fasta" | wc -l
-  10
+  \s*10 (re)
   $ rm -f "$TMP/filtered.fasta"
 
 Try to filter with subsampling when there are more available groups than requested sequences.
-This should fail.
+This should fail, as probabilistic sampling is explicitly disabled.
 
   $ ${AUGUR} filter \
   >  --sequences filter/sequences.fasta \
@@ -28,12 +29,13 @@ This should fail.
   >  --group-by country year month \
   >  --subsample-max-sequences 5 \
   >  --subsample-seed 314159 \
+  >  --no-probabilistic-sampling \
   >  --output "$TMP/filtered.fasta"
   ERROR: Asked to provide at most 5 sequences, but there are 10 groups.
   [1]
   $ rm -f "$TMP/filtered.fasta"
 
-Use probabilistic subsampling to handle the case when there are more available groups than requested sequences.
+Explicitly use probabilistic subsampling to handle the case when there are more available groups than requested sequences.
 
   $ ${AUGUR} filter \
   >  --sequences filter/sequences.fasta \
@@ -45,3 +47,15 @@ Use probabilistic subsampling to handle the case when there are more available g
   >  --probabilistic-sampling \
   >  --output "$TMP/filtered.fasta" > /dev/null
   $ rm -f "$TMP/filtered.fasta"
+
+Omit the sampling flags to rely on the default (probabilistic) subsampling; this should work the same as the previous case.
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --group-by country year month \
+  >  --subsample-max-sequences 5 \
+  >  --subsample-seed 314159 \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ rm -f "$TMP/filtered.fasta"