nextstrain · huddlej · Mar 8, 2021 · Feb 13, 2021 · Feb 20, 2021 · Feb 20, 2021
diff --git a/augur/filter.py b/augur/filter.py
diff --git a/augur/utils.py b/augur/utils.py
@@ -686,3 +686,38 @@ def load_mask_sites(mask_file):
     "A", "G", "C", "T", "U", "N", "R", "Y", "S", "W", "K", "M", "B", "V", "D", "H", "-",
     "a", "g", "c", "t", "u", "n", "r", "y", "s", "w", "k", "m", "b", "v", "d", "h", "-"
 }
+
+
+def read_strains(*files, comment_char="#"):
+    """Reads strain names from one or more plain text files and returns the
+    set of distinct strains.
+
+    Strain names can be commented with full-line or inline comments. For
+    example, the following is a valid strain names file:
+
+        # this is a comment at the top of the file
+        strain1  # exclude strain1 because it isn't sequenced properly
+        strain2
+          # this is an empty line that will be ignored.
+
+    Parameters
+    ----------
+    files : one or more str
+        one or more names of text files with one strain name per line
+
+    Returns
+    -------
+    set :
+        strain names from the given input files
+
+    """
+    strains = set()
+    for input_file in files:
+        with open(input_file, 'r', encoding='utf-8') as ifile:
+            for line in ifile:
+                # Allow comments anywhere in a given line.
+                strain_name = line.split(comment_char)[0].strip()
+                if len(strain_name) > 0:
+                    strains.add(strain_name)
+
+    return strains
diff --git a/tests/builds/zika.t b/tests/builds/zika.t
@@ -43,8 +43,8 @@ Filter sequences by a minimum date and an exclusion list and only keep one seque
   >   --subsample-seed 314159 \
   >   --no-probabilistic-sampling \
   >   --min-date 2012 > /dev/null
-
-  $ diff -u "results/filtered.fasta" "$TMP/out/filtered.fasta"
+  $ grep "^>" "$TMP/out/filtered.fasta" | wc -l
+  \s*10 (re)
 
 Align filtered sequences to a specific reference sequence and fill any gaps.
 

diff --git a/tests/functional/filter.t b/tests/functional/filter.t
@@ -33,7 +33,7 @@ This should fail, as probabilistic sampling is explicitly disabled.
   >  --subsample-seed 314159 \
   >  --no-probabilistic-sampling \
   >  --output "$TMP/filtered.fasta"
-  ERROR: Asked to provide at most 5 sequences, but there are 10 groups.
+  ERROR: Asked to provide at most 5 sequences, but there are 8 groups.
   [1]
   $ rm -f "$TMP/filtered.fasta"
 
@@ -63,3 +63,188 @@ Using the default probabilistic subsampling, should work the same as the previou
   >  --subsample-seed 314159 \
   >  --output "$TMP/filtered.fasta" > /dev/null
   $ rm -f "$TMP/filtered.fasta"
+
+Filter using only metadata without sequence input or output and save results as filtered metadata.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
+
+Output should include the 8 sequences matching the filters and a header line.
+
+  $ wc -l "$TMP/filtered_metadata.tsv"
+  \s*9 .* (re)
+  $ rm -f "$TMP/filtered_metadata.tsv"
+
+Filter using only metadata and save results as a list of filtered strains.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+
+Output should include only the 8 sequences matching the filters (without a header line).
+
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*8 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Filter using only metadata without a sequence index.
+This should work because the requested filters don't rely on sequence information.
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Try to filter using only metadata without a sequence index.
+This should fail because the requested filters rely on sequence information.
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --min-length 10000 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
+  [1]
+
+Try to filter with sequence outputs and no sequence inputs.
+This should fail.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-length 10000 \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  ERROR: You need to provide sequences to output sequences.
+  [1]
+
+Try to filter without any outputs.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-length 10000 > /dev/null
+  ERROR: You need to select at least one output.
+  [1]
+
+Filter into two separate sets and then select sequences from the union of those sets.
+First, select strains from Brazil (there should be 1).
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --query "country == 'Brazil'" \
+  >  --output-strains "$TMP/filtered_strains.brazil.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.brazil.txt"
+  \s*1 .* (re)
+
+Then, select strains from Colombia (there should be 3).
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --query "country == 'Colombia'" \
+  >  --output-strains "$TMP/filtered_strains.colombia.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.colombia.txt"
+  \s*3 .* (re)
+
+Finally, exclude all sequences except those from the two sets of strains (there should be 4).
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --exclude-all \
+  >  --include "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*4 (re)
+  $ rm -f "$TMP/filtered.fasta"
+
+Alternately, exclude only the sequences from Brazil and Colombia (12 - 4 strains).
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --exclude "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*6 (re)
+  $ rm -f "$TMP/filtered.fasta"
+
+Try to filter with sequences that don't match any of the metadata.
+This should produce no results because the intersection of metadata and sequences is empty.
+
+  $ echo -e ">something\nATCG" > "$TMP/dummy.fasta"
+  $ ${AUGUR} filter \
+  >  --sequences "$TMP/dummy.fasta" \
+  >  --metadata filter/metadata.tsv \
+  >  --max-date 2020-01-30 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  WARNING: A sequence index was not provided, so we are generating one. Generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [1]
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*0 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Repeat with sequence and strain outputs. We should get the same results.
+
+  $ ${AUGUR} filter \
+  >  --sequences "$TMP/dummy.fasta" \
+  >  --metadata filter/metadata.tsv \
+  >  --max-date 2020-01-30 \
+  >  --output-strains "$TMP/filtered_strains.txt" \
+  >  --output-sequences "$TMP/filtered.fasta" > /dev/null
+  WARNING: A sequence index was not provided, so we are generating one. Generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [1]
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*0 .* (re)
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*0 (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+  $ rm -f "$TMP/filtered.fasta"
+
+Filter TB strains from VCF and save as a list of filtered strains.
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/tb.vcf.gz \
+  >  --metadata filter/tb_metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*3 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Confirm that filtering omits strains without metadata or sequences.
+The input sequences are missing one strain that is in the metadata.
+The metadata are missing one strain that has a sequence.
+The list of strains to include has one strain with no metadata/sequence and one strain with information that would have been filtered by country.
+The query initially filters 3 strains from Colombia, one of which is added back by the include.
+
+  $ echo "NotReal" > "$TMP/include.txt"
+  $ echo "COL/FLR_00008/2015" >> "$TMP/include.txt"
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --query "country != 'Colombia'" \
+  >  --include "$TMP/include.txt" \
+  >  --output-strains "$TMP/filtered_strains.txt"
+  4 strains were dropped during filtering
+  \t1 had no sequence data (esc)
+  \t1 had no metadata (esc)
+  \t3 of these were filtered out by the query: (esc)
+  \t\t"country != 'Colombia'" (esc)
+   (esc)
+  \t1 strains were added back because they were requested by include files (esc)
+  \t1 strains from include files were not added because they lacked sequence or metadata (esc)
+  8 strains passed all filters
+
+  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/metadata.tsv b/tests/functional/filter/metadata.tsv
@@ -1,5 +1,4 @@
 strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
-PAN/CDC_259359_V1_V3/2015	zika	KX156774	2015-12-18	North America	Panama	Panama	Panama	genbank	genome	Shabman et al	https://www.ncbi.nlm.nih.gov/nuccore/KX156774	Direct Submission	Submitted (29-APR-2016) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
 COL/FLR_00024/2015	zika	MF574569	2015-12-XX	South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574569	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
 PRVABC59	zika	KU501215	2015-12-XX	North America	Puerto Rico	Puerto Rico	Puerto Rico	genbank	genome	Lanciotti et al	https://www.ncbi.nlm.nih.gov/nuccore/KU501215	Phylogeny of Zika Virus in Western Hemisphere, 2015	Emerging Infect. Dis. 22 (5), 933-935 (2016)	https://www.ncbi.nlm.nih.gov/pubmed/27088323
 COL/FLR_00008/2015	zika	MF574562	2015-12-XX	South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574562	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/

diff --git a/tests/functional/filter/sequence_index.tsv b/tests/functional/filter/sequence_index.tsv
@@ -10,4 +10,3 @@ DOM/2016/BB_0059	10035	2563	2089	2741	2015	621	6	0	0	0
 BRA/2016/FC_6706	10366	2747	2203	2915	2165	329	7	0	0	0
 DOM/2016/BB_0183	10621	2910	2343	3099	2269	0	0	0	0	0
 EcEs062_16	10812	2960	2388	3158	2306	0	0	0	0	0
-HND/2016/HU_ME59	10365	2842	2271	3016	2233	0	3	0	0	0