nextstrain · rneher · Dec 28, 2020 · Nov 10, 2020 · Nov 10, 2020 · Dec 14, 2020
diff --git a/augur/filter.py b/augur/filter.py
@@ -100,6 +100,7 @@ def register_arguments(parser):
     subsample_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
     subsample_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
     parser.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
+    parser.add_argument('--probabilistic-sampling', action='store_true', help="Sample probabilitically from groups -- useful when there are more groups than requested sequences")
     parser.add_argument('--subsample-seed', help="random number generator seed to allow reproducible sub-sampling (with same input data). Can be number or string.")
     parser.add_argument('--exclude-where', nargs='+',
                                 help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
@@ -329,25 +330,46 @@ def run(args):
                 # this is only possible if we have imposed a maximum number of samples
                 # to produce.  we need binary search until we have the correct spg.
                 try:
-                    spg = _calculate_sequences_per_group(
-                        args.subsample_max_sequences,
-                        [len(sequences_in_group)
-                         for sequences_in_group in seq_names_by_group.values()],
-                    )
+                    length_of_sequences_per_group = [
+                        len(sequences_in_group)
+                        for sequences_in_group in seq_names_by_group.values()
+                    ]
+
+                    if args.probabilistic_sampling:
+                        spg = _calculate_fractional_sequences_per_group(
+                            args.subsample_max_sequences,
+                            length_of_sequences_per_group
+                        )
+                    else:
+                        spg = _calculate_sequences_per_group(
+                            args.subsample_max_sequences,
+                            length_of_sequences_per_group
+                        )
                 except TooManyGroupsError as ex:
                     print(f"ERROR: {ex}", file=sys.stderr)
                     sys.exit(1)
                 print("sampling at {} per group.".format(spg))
 
+            if args.probabilistic_sampling:
+                random_generator = np.random.default_rng()
+
             # subsample each groups, either by taking the spg highest priority strains or
             # sampling at random from the sequences in the group
             seq_subsample = []
             for group, sequences_in_group in seq_names_by_group.items():
+                if args.probabilistic_sampling:
+                    tmp_spg = random_generator.poisson(spg)
+                else:
+                    tmp_spg = spg
+
+                if tmp_spg == 0:
+                    continue
+
                 if args.priority: #sort descending by priority
-                    seq_subsample.extend(sorted(sequences_in_group, key=lambda x:priorities[x], reverse=True)[:spg])
+                    seq_subsample.extend(sorted(sequences_in_group, key=lambda x:priorities[x], reverse=True)[:tmp_spg])
                 else:
-                    seq_subsample.extend(sequences_in_group if len(sequences_in_group)<=spg
-                                         else random.sample(sequences_in_group, spg))
+                    seq_subsample.extend(sequences_in_group if len(sequences_in_group)<=tmp_spg
+                                         else random.sample(sequences_in_group, tmp_spg))
 
             num_excluded_subsamp = len(seq_keep) - len(seq_subsample)
             seq_keep = seq_subsample
@@ -485,8 +507,8 @@ def __str__(self):
 
 
 def _calculate_total_sequences(
-        hypothetical_spg: int, sequence_lengths: Collection[int],
-) -> int:
+        hypothetical_spg: float, sequence_lengths: Collection[int],
+) -> float:
     # calculate how many sequences we'd keep given a hypothetical spg.
     return sum(
         min(hypothetical_spg, sequence_length)
@@ -496,7 +518,7 @@ def _calculate_total_sequences(
 
 def _calculate_sequences_per_group(
         target_max_value: int,
-        sequence_lengths: Collection[int],
+        sequence_lengths: Collection[int]
 ) -> int:
     """This is partially inspired by
     https://github.com/python/cpython/blob/3.8/Lib/bisect.py
@@ -537,13 +559,62 @@ def _calculate_sequences_per_group(
 
     lo = 1
     hi = target_max_value
+
     while hi - lo > 2:
-        mid = (lo + hi) // 2
+        mid = (hi + lo) // 2
         if _calculate_total_sequences(mid, sequence_lengths) <= target_max_value:
             lo = mid
         else:
             hi = mid
+
     if _calculate_total_sequences(hi, sequence_lengths) <= target_max_value:
-        return hi
+        return int(hi)
     else:
-        return lo
+        return int(lo)
+
+
+def _calculate_fractional_sequences_per_group(
+        target_max_value: int,
+        sequence_lengths: Collection[int]
+) -> float:
+    """Returns the fractional sequences per group for the given list of group
+    sequences such that the total doesn't exceed the requested number of
+    samples.
+
+    Parameters
+    ----------
+    target_max_value : int
+        the total number of sequences allowed across all groups
+    sequence_lengths : Collection[int]
+        the number of sequences in each group
+
+    Returns
+    -------
+    float
+        fractional maximum number of sequences allowed per group to meet the
+        required maximum total sequences allowed
+
+    >>> np.around(_calculate_fractional_sequences_per_group(4, [4, 2]), 4)
+    1.9375
+    >>> np.around(_calculate_fractional_sequences_per_group(2, [4, 2]), 4)
+    0.9688
+
+    Unlike the integer-based version of this function, the fractional version
+    can accept a maximum number of sequences that exceeds the number of groups.
+    In this case, the function returns a fraction that can be used downstream,
+    for example with Poisson sampling.
+
+    >>> np.around(_calculate_fractional_sequences_per_group(1, [4, 2]), 4)
+    0.4844
+    """
+    lo = 1e-5
+    hi = target_max_value
+
+    while (hi / lo) > 1.1:
+        mid = (lo + hi) / 2
+        if _calculate_total_sequences(mid, sequence_lengths) <= target_max_value:
+            lo = mid
+        else:
+            hi = mid
+
+    return (lo + hi) / 2
diff --git a/tests/functional/filter.t b/tests/functional/filter.t
@@ -0,0 +1,47 @@
+Integration tests for augur filter.
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="../../bin/augur"
+
+Filter with subsampling, requesting no more than 10 sequences.
+With 10 groups to subsample from, this should produce one sequence per group.
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --group-by country year month \
+  >  --subsample-max-sequences 10 \
+  >  --subsample-seed 314159 \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ grep ">" "$TMP/filtered.fasta" | wc -l
+  10
+  $ rm -f "$TMP/filtered.fasta"
+
+Try to filter with subsampling when there are more available groups than requested sequences.
+This should fail.
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --group-by country year month \
+  >  --subsample-max-sequences 5 \
+  >  --subsample-seed 314159 \
+  >  --output "$TMP/filtered.fasta"
+  ERROR: Asked to provide at most 5 sequences, but there are 10 groups.
+  [1]
+  $ rm -f "$TMP/filtered.fasta"
+
+Use probabilistic subsampling to handle the case when there are more available groups than requested sequences.
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --group-by country year month \
+  >  --subsample-max-sequences 5 \
+  >  --subsample-seed 314159 \
+  >  --probabilistic-sampling \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ rm -f "$TMP/filtered.fasta"
diff --git a/tests/functional/filter/metadata.tsv b/tests/functional/filter/metadata.tsv
@@ -0,0 +1,13 @@
+strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
+PAN/CDC_259359_V1_V3/2015	zika	KX156774	2015-12-18	North America	Panama	Panama	Panama	genbank	genome	Shabman et al	https://www.ncbi.nlm.nih.gov/nuccore/KX156774	Direct Submission	Submitted (29-APR-2016) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
+COL/FLR_00024/2015	zika	MF574569	2015-12-XX	South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574569	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
+PRVABC59	zika	KU501215	2015-12-XX	North America	Puerto Rico	Puerto Rico	Puerto Rico	genbank	genome	Lanciotti et al	https://www.ncbi.nlm.nih.gov/nuccore/KU501215	Phylogeny of Zika Virus in Western Hemisphere, 2015	Emerging Infect. Dis. 22 (5), 933-935 (2016)	https://www.ncbi.nlm.nih.gov/pubmed/27088323
+COL/FLR_00008/2015	zika	MF574562	2015-12-XX	South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574562	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
+Colombia/2016/ZC204Se	zika	KY317939	2016-01-06	South America	Colombia	Colombia	Colombia	genbank	genome	Quick et al	https://www.ncbi.nlm.nih.gov/nuccore/KY317939	Multiplex PCR method for MinION and Illumina sequencing of Zika and other virus genomes directly from clinical samples	Nat Protoc 12 (6), 1261-1276 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538739
+ZKC2/2016	zika	KX253996	2016-02-16	Oceania	American Samoa	American Samoa	American Samoa	genbank	genome	Wu et al	https://www.ncbi.nlm.nih.gov/nuccore/KX253996	Direct Submission	Submitted (18-MAY-2016) Center for Diseases Control and Prevention of Guangdong Province; National Institute of Viral Disease Control and Prevention, China	https://www.ncbi.nlm.nih.gov/pubmed/
+VEN/UF_1/2016	zika	KX702400	2016-03-25	South America	Venezuela	Venezuela	Venezuela	genbank	genome	Blohm et al	https://www.ncbi.nlm.nih.gov/nuccore/KX702400	Complete Genome Sequences of Identical Zika virus Isolates in a Nursing Mother and Her Infant	Genome Announc 5 (17), e00231-17 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28450510
+DOM/2016/BB_0059	zika	KY785425	2016-04-04	North America	Dominican Republic	Dominican Republic	Dominican Republic	genbank	genome	Metsky et al	https://www.ncbi.nlm.nih.gov/nuccore/KY785425	Zika virus evolution and spread in the Americas	Nature 546 (7658), 411-415 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538734
+BRA/2016/FC_6706	zika	KY785433	2016-04-08	South America	Brazil	Brazil	Brazil	genbank	genome	Metsky et al	https://www.ncbi.nlm.nih.gov/nuccore/KY785433	Zika virus evolution and spread in the Americas	Nature 546 (7658), 411-415 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538734
+DOM/2016/BB_0183	zika	KY785420	2016-04-18	North America	Dominican Republic	Dominican Republic	Dominican Republic	genbank	genome	Metsky et al	https://www.ncbi.nlm.nih.gov/nuccore/KY785420	Zika virus evolution and spread in the Americas	Nature 546 (7658), 411-415 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538734
+EcEs062_16	zika	KX879603	2016-04-XX	South America	Ecuador	Ecuador	Ecuador	genbank	genome	Marquez et al	https://www.ncbi.nlm.nih.gov/nuccore/KX879603	First Complete Genome Sequences of Zika Virus Isolated from Febrile Patient Sera in Ecuador	Genome Announc 5 (8), e01673-16 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28232448
+HND/2016/HU_ME59	zika	KY785418	2016-05-13	North America	Honduras	Honduras	Honduras	genbank	genome	Metsky et al	https://www.ncbi.nlm.nih.gov/nuccore/KY785418	Zika virus evolution and spread in the Americas	Nature 546 (7658), 411-415 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538734