Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/probabilistic sampling #629

Merged
merged 4 commits into from
Dec 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 85 additions & 14 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def register_arguments(parser):
subsample_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
subsample_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
parser.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
parser.add_argument('--probabilistic-sampling', action='store_true', help="Sample probabilitically from groups -- useful when there are more groups than requested sequences")
parser.add_argument('--subsample-seed', help="random number generator seed to allow reproducible sub-sampling (with same input data). Can be number or string.")
parser.add_argument('--exclude-where', nargs='+',
help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
Expand Down Expand Up @@ -329,25 +330,46 @@ def run(args):
# this is only possible if we have imposed a maximum number of samples
# to produce. we need binary search until we have the correct spg.
try:
spg = _calculate_sequences_per_group(
args.subsample_max_sequences,
[len(sequences_in_group)
for sequences_in_group in seq_names_by_group.values()],
)
length_of_sequences_per_group = [
len(sequences_in_group)
for sequences_in_group in seq_names_by_group.values()
]

if args.probabilistic_sampling:
spg = _calculate_fractional_sequences_per_group(
args.subsample_max_sequences,
length_of_sequences_per_group
)
else:
spg = _calculate_sequences_per_group(
args.subsample_max_sequences,
length_of_sequences_per_group
)
except TooManyGroupsError as ex:
print(f"ERROR: {ex}", file=sys.stderr)
sys.exit(1)
print("sampling at {} per group.".format(spg))

if args.probabilistic_sampling:
random_generator = np.random.default_rng()

# subsample each groups, either by taking the spg highest priority strains or
# sampling at random from the sequences in the group
seq_subsample = []
for group, sequences_in_group in seq_names_by_group.items():
if args.probabilistic_sampling:
tmp_spg = random_generator.poisson(spg)
else:
tmp_spg = spg

if tmp_spg == 0:
continue

if args.priority: #sort descending by priority
seq_subsample.extend(sorted(sequences_in_group, key=lambda x:priorities[x], reverse=True)[:spg])
seq_subsample.extend(sorted(sequences_in_group, key=lambda x:priorities[x], reverse=True)[:tmp_spg])
else:
seq_subsample.extend(sequences_in_group if len(sequences_in_group)<=spg
else random.sample(sequences_in_group, spg))
seq_subsample.extend(sequences_in_group if len(sequences_in_group)<=tmp_spg
else random.sample(sequences_in_group, tmp_spg))

num_excluded_subsamp = len(seq_keep) - len(seq_subsample)
seq_keep = seq_subsample
Expand Down Expand Up @@ -485,8 +507,8 @@ def __str__(self):


def _calculate_total_sequences(
hypothetical_spg: int, sequence_lengths: Collection[int],
) -> int:
hypothetical_spg: float, sequence_lengths: Collection[int],
) -> float:
# calculate how many sequences we'd keep given a hypothetical spg.
return sum(
min(hypothetical_spg, sequence_length)
Expand All @@ -496,7 +518,7 @@ def _calculate_total_sequences(

def _calculate_sequences_per_group(
target_max_value: int,
sequence_lengths: Collection[int],
sequence_lengths: Collection[int]
) -> int:
"""This is partially inspired by
https://github.com/python/cpython/blob/3.8/Lib/bisect.py
Expand Down Expand Up @@ -537,13 +559,62 @@ def _calculate_sequences_per_group(

lo = 1
hi = target_max_value

while hi - lo > 2:
mid = (lo + hi) // 2
mid = (hi + lo) // 2
if _calculate_total_sequences(mid, sequence_lengths) <= target_max_value:
lo = mid
else:
hi = mid

if _calculate_total_sequences(hi, sequence_lengths) <= target_max_value:
return hi
return int(hi)
else:
return lo
return int(lo)


def _calculate_fractional_sequences_per_group(
target_max_value: int,
sequence_lengths: Collection[int]
) -> float:
"""Returns the fractional sequences per group for the given list of group
sequences such that the total doesn't exceed the requested number of
samples.

Parameters
----------
target_max_value : int
the total number of sequences allowed across all groups
sequence_lengths : Collection[int]
the number of sequences in each group

Returns
-------
float
fractional maximum number of sequences allowed per group to meet the
required maximum total sequences allowed

>>> np.around(_calculate_fractional_sequences_per_group(4, [4, 2]), 4)
1.9375
>>> np.around(_calculate_fractional_sequences_per_group(2, [4, 2]), 4)
0.9688

Unlike the integer-based version of this function, the fractional version
can accept a maximum number of sequences that exceeds the number of groups.
In this case, the function returns a fraction that can be used downstream,
for example with Poisson sampling.

>>> np.around(_calculate_fractional_sequences_per_group(1, [4, 2]), 4)
0.4844
"""
lo = 1e-5
hi = target_max_value

while (hi / lo) > 1.1:
mid = (lo + hi) / 2
if _calculate_total_sequences(mid, sequence_lengths) <= target_max_value:
lo = mid
else:
hi = mid

return (lo + hi) / 2
47 changes: 47 additions & 0 deletions tests/functional/filter.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
Integration tests for augur filter.

$ pushd "$TESTDIR" > /dev/null
$ export AUGUR="../../bin/augur"

Filter with subsampling, requesting no more than 10 sequences.
With 10 groups to subsample from, this should produce one sequence per group.

$ ${AUGUR} filter \
> --sequences filter/sequences.fasta \
> --metadata filter/metadata.tsv \
> --min-date 2012 \
> --group-by country year month \
> --subsample-max-sequences 10 \
> --subsample-seed 314159 \
> --output "$TMP/filtered.fasta" > /dev/null
$ grep ">" "$TMP/filtered.fasta" | wc -l
10
$ rm -f "$TMP/filtered.fasta"

Try to filter with subsampling when there are more available groups than requested sequences.
This should fail.

$ ${AUGUR} filter \
> --sequences filter/sequences.fasta \
> --metadata filter/metadata.tsv \
> --min-date 2012 \
> --group-by country year month \
> --subsample-max-sequences 5 \
> --subsample-seed 314159 \
> --output "$TMP/filtered.fasta"
ERROR: Asked to provide at most 5 sequences, but there are 10 groups.
[1]
$ rm -f "$TMP/filtered.fasta"

Use probabilistic subsampling to handle the case when there are more available groups than requested sequences.

$ ${AUGUR} filter \
> --sequences filter/sequences.fasta \
> --metadata filter/metadata.tsv \
> --min-date 2012 \
> --group-by country year month \
> --subsample-max-sequences 5 \
> --subsample-seed 314159 \
> --probabilistic-sampling \
> --output "$TMP/filtered.fasta" > /dev/null
$ rm -f "$TMP/filtered.fasta"
13 changes: 13 additions & 0 deletions tests/functional/filter/metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
strain virus accession date region country division city db segment authors url title journal paper_url
PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774 Direct Submission Submitted (29-APR-2016) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 Phylogeny of Zika Virus in Western Hemisphere, 2015 Emerging Infect. Dis. 22 (5), 933-935 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27088323
COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939 Multiplex PCR method for MinION and Illumina sequencing of Zika and other virus genomes directly from clinical samples Nat Protoc 12 (6), 1261-1276 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538739
ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al https://www.ncbi.nlm.nih.gov/nuccore/KX253996 Direct Submission Submitted (18-MAY-2016) Center for Diseases Control and Prevention of Guangdong Province; National Institute of Viral Disease Control and Prevention, China https://www.ncbi.nlm.nih.gov/pubmed/
VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al https://www.ncbi.nlm.nih.gov/nuccore/KX702400 Complete Genome Sequences of Identical Zika virus Isolates in a Nursing Mother and Her Infant Genome Announc 5 (17), e00231-17 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28450510
DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785425 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785433 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785420 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603 First Complete Genome Sequences of Zika Virus Isolated from Febrile Patient Sera in Ecuador Genome Announc 5 (8), e01673-16 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28232448
HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
Loading