diff --git a/augur/filter.py b/augur/filter.py
index 39e909cd3..6e08f453e 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -14,7 +14,7 @@
 import treetime.utils
 
 from .index import index_sequences
-from .utils import read_metadata, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous
+from .utils import read_metadata, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous
 
 comment_char = '#'
 MAX_NUMBER_OF_PROBABILISTIC_SAMPLING_ATTEMPTS = 10
@@ -92,32 +92,49 @@ def filter_by_query(sequences, metadata_file, query):
     return [seq for seq in sequences if seq in filtered_meta_dict]
 
 def register_arguments(parser):
-    parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta or VCF format")
-    parser.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV")
-    parser.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
-    parser.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
-    parser.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
-    parser.add_argument('--min-length', type=int, help="minimal length of the sequences")
-    parser.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
-    parser.add_argument('--exclude', type=str, help="file with list of strains that are to be excluded")
-    parser.add_argument('--include', type=str, help="file with list of strains that are to be included regardless of priorities or subsampling")
-    parser.add_argument('--priority', type=str, help="file with list of priority scores for sequences (strain\tpriority)")
-    subsample_group = parser.add_mutually_exclusive_group()
-    subsample_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
-    subsample_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
-    parser.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
-    probabilistic_sampling_group = parser.add_mutually_exclusive_group()
-    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
-    probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
-    parser.add_argument('--subsample-seed', help="random number generator seed to allow reproducible sub-sampling (with same input data). Can be number or string.")
-    parser.add_argument('--exclude-where', nargs='+',
+    input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered")
+    input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV")
+    input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
+    input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
+
+    metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
+    metadata_filter_group.add_argument(
+        '--query',
+        help="""Filter samples by attribute.
+        Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
+        (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
+    )
+    metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
+    metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
+    metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
+                                help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
+    metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
+    metadata_filter_group.add_argument('--exclude-where', nargs='+',
                                 help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
-    parser.add_argument('--include-where', nargs='+',
+    metadata_filter_group.add_argument('--exclude-all', action="store_true", help="exclude all strains by default. Use this with the include arguments to select a specific subset of strains.")
+    metadata_filter_group.add_argument('--include', type=str, nargs="+", help="file(s) with list of strains to include regardless of priorities or subsampling")
+    metadata_filter_group.add_argument('--include-where', nargs='+',
                                 help="Include samples with these values. ex: host=rat. Multiple values are processed as OR (having any of those specified will be included), not AND. This rule is applied last and ensures any sequences matching these rules will be included.")
-    parser.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
-                                help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
-    parser.add_argument('--query', help="Filter samples by attribute. Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.")
-    parser.add_argument('--output', '-o', help="output file", required=True)
+    metadata_filter_group.add_argument('--priority', type=str, help="file with list of priority scores for strains (strain\tpriority)")
+
+    sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
+    sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences")
+    sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
+
+    subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
+    subsample_group.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
+    subsample_limits_group = subsample_group.add_mutually_exclusive_group()
+    subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
+    subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
+    probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
+    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
+    probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
+    subsample_group.add_argument('--subsample-seed', help="random number generator seed to allow reproducible sub-sampling (with same input data). Can be number or string.")
+
+    output_group = parser.add_argument_group("outputs", "possible representations of filtered data (at least one required)")
+    output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format, or in VCF format when the input sequences are VCF")
+    output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
+    output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
 
     parser.set_defaults(probabilistic_sampling=True)
 
@@ -125,10 +142,48 @@ def run(args):
     '''
     filter and subsample a set of sequences into an analysis set
     '''
+    # Validate arguments before attempting any I/O.
+    # Don't allow sequence output when no sequence input is provided.
+    if args.output and not args.sequences:
+        print(
+            "ERROR: You need to provide sequences to output sequences.",
+            file=sys.stderr)
+        return 1
+
+    # Confirm that at least one output was requested.
+    if not any((args.output, args.output_metadata, args.output_strains)):
+        print(
+            "ERROR: You need to select at least one output.",
+            file=sys.stderr)
+        return 1
+
+    # Don't allow filtering on sequence-based information, if no sequences or
+    # sequence index is provided.
+    SEQUENCE_ONLY_FILTERS = [
+        args.min_length,
+        args.non_nucleotide
+    ]
+    if not args.sequences and not args.sequence_index and any(SEQUENCE_ONLY_FILTERS):
+        print(
+            "ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.",
+            file=sys.stderr)
+        return 1
+
+    # Load inputs, starting with metadata.
+    try:
+        # Metadata are the source of truth for which sequences we want to keep
+        # in filtered output.
+        meta_dict, meta_columns = read_metadata(args.metadata)
+        metadata_strains = set(meta_dict.keys())
+    except ValueError as error:
+        print("ERROR: Problem reading in {}:".format(args.metadata), file=sys.stderr)
+        print(error, file=sys.stderr)
+        return 1
+
     #Set flags if VCF
     is_vcf = False
     is_compressed = False
-    if any([args.sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
+    if args.sequences and any([args.sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
         is_vcf = True
         if args.sequences.lower().endswith('.gz'):
             is_compressed = True
@@ -142,12 +197,13 @@ def run(args):
                   "Please see the augur install instructions to install it.")
             return 1
 
-    ####Read in files
+    # Read in files
 
-    #If VCF, open and get sequence names
+    # If VCF, open and get sequence names
     if is_vcf:
-        seq_keep, all_seq = read_vcf(args.sequences)
-    else:
+        vcf_sequences, _ = read_vcf(args.sequences)
+        sequence_strains = set(vcf_sequences)
+    elif args.sequences or args.sequence_index:
         # If FASTA, try to load the sequence composition details and strain
         # names to be filtered.
         index_is_autogenerated = False
@@ -180,45 +236,46 @@ def run(args):
 
         # Calculate summary statistics needed for filtering.
         sequence_index["ACGT"] = sequence_index.loc[:, ["A", "C", "G", "T"]].sum(axis=1)
-        seq_keep = sequence_index["strain"].values
-        all_seq = seq_keep.copy()
+        sequence_strains = set(sequence_index["strain"].values)
+    else:
+        sequence_strains = None
 
-    try:
-        meta_dict, meta_columns = read_metadata(args.metadata)
-    except ValueError as error:
-        print("ERROR: Problem reading in {}:".format(args.metadata))
-        print(error)
-        return 1
+    if sequence_strains is not None:
+        # Count strains found only in the sequences (no metadata) and only in the metadata (no sequences).
+        num_excluded_by_lack_of_metadata = len(sequence_strains - metadata_strains)
+        num_excluded_by_lack_of_sequences = len(metadata_strains - sequence_strains)
+
+        # Intersect sequence strain names with metadata strains.
+        available_strains = metadata_strains & sequence_strains
+    else:
+        num_excluded_by_lack_of_metadata = None
+        num_excluded_by_lack_of_sequences = None
+
+        # When no sequence data are available, we treat the metadata as the
+        # source of truth.
+        available_strains = metadata_strains
 
+    # Track the strains that are available to select by the filters below, after
+    # accounting for availability of metadata and sequences.
+    seq_keep = available_strains.copy()
 
     #####################################
     #Filtering steps
     #####################################
 
-    # remove sequences without meta data
-    tmp = [ ]
-    for seq_name in seq_keep:
-        if seq_name in meta_dict:
-            tmp.append(seq_name)
-        else:
-            print("No meta data for %s, excluding from all further analysis."%seq_name)
-    seq_keep = tmp
+    # Exclude all strains by default.
+    if args.exclude_all:
+        num_excluded_by_all = len(available_strains)
+        seq_keep = set()
 
     # remove strains explicitly excluded by name
     # read list of strains to exclude from file and prune seq_keep
     num_excluded_by_name = 0
     if args.exclude:
         try:
-            with open(args.exclude, 'r', encoding='utf-8') as ifile:
-                to_exclude = set()
-                for line in ifile:
-                    if line[0] != comment_char:
-                        # strip whitespace and remove all text following comment character
-                        exclude_name = line.split(comment_char)[0].strip()
-                        to_exclude.add(exclude_name)
-            tmp = [seq_name for seq_name in seq_keep if seq_name not in to_exclude]
-            num_excluded_by_name = len(seq_keep) - len(tmp)
-            seq_keep = tmp
+            to_exclude = read_strains(*args.exclude)
+            num_excluded_by_name = len(seq_keep & to_exclude)
+            seq_keep = seq_keep - to_exclude
         except FileNotFoundError as e:
-            print("ERROR: Could not open file of excluded strains '%s'" % args.exclude, file=sys.stderr)
+            print("ERROR: Could not open file of excluded strains '%s'" % e.filename, file=sys.stderr)
             sys.exit(1)
@@ -241,15 +298,15 @@ def run(args):
                     else: # i.e. property=value requested
                         if meta_dict[seq_name].get(col,'unknown').lower() == val.lower():
                             to_exclude.add(seq_name)
-                tmp = [seq_name for seq_name in seq_keep if seq_name not in to_exclude]
-                num_excluded_by_metadata[ex] = len(seq_keep) - len(tmp)
-                seq_keep = tmp
+
+                num_excluded_by_metadata[ex] = len(seq_keep & to_exclude)
+                seq_keep = seq_keep - to_exclude
 
     # exclude strains by metadata, using Pandas querying
     num_excluded_by_query = 0
     if args.query:
-        filtered = filter_by_query(seq_keep, args.metadata, args.query)
-        num_excluded_by_query = len(seq_keep) - len(filtered)
+        filtered = set(filter_by_query(list(seq_keep), args.metadata, args.query))
+        num_excluded_by_query = len(seq_keep - filtered)
         seq_keep = filtered
 
     # filter by sequence length
@@ -261,9 +318,11 @@ def run(args):
             is_in_seq_keep = sequence_index["strain"].isin(seq_keep)
             is_gte_min_length = sequence_index["ACGT"] >= args.min_length
 
-            seq_keep_by_length = sequence_index[
-                (is_in_seq_keep) & (is_gte_min_length)
-            ]["strain"].tolist()
+            seq_keep_by_length = set(
+                sequence_index[
+                    (is_in_seq_keep) & (is_gte_min_length)
+                ]["strain"].tolist()
+            )
 
             num_excluded_by_length = len(seq_keep) - len(seq_keep_by_length)
             seq_keep = seq_keep_by_length
@@ -271,10 +330,10 @@ def run(args):
     # filter by ambiguous dates
     num_excluded_by_ambiguous_date = 0
     if args.exclude_ambiguous_dates_by and 'date' in meta_columns:
-        seq_keep_by_date = []
+        seq_keep_by_date = set()
         for seq_name in seq_keep:
             if not is_date_ambiguous(meta_dict[seq_name]['date'], args.exclude_ambiguous_dates_by):
-                seq_keep_by_date.append(seq_name)
+                seq_keep_by_date.add(seq_name)
 
         num_excluded_by_ambiguous_date = len(seq_keep) - len(seq_keep_by_date)
         seq_keep = seq_keep_by_date
@@ -283,11 +342,11 @@ def run(args):
     num_excluded_by_date = 0
     if (args.min_date or args.max_date) and 'date' in meta_columns:
         dates = get_numerical_dates(meta_dict, fmt="%Y-%m-%d")
-        tmp = [s for s in seq_keep if dates[s] is not None]
+        tmp = {s for s in seq_keep if dates[s] is not None}
         if args.min_date:
-            tmp = [s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>args.min_date]
+            tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>args.min_date}
         if args.max_date:
-            tmp = [s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s])<args.max_date]
+            tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s])<args.max_date}
         num_excluded_by_date = len(seq_keep) - len(tmp)
         seq_keep = tmp
 
@@ -296,9 +355,11 @@ def run(args):
     if args.non_nucleotide:
         is_in_seq_keep = sequence_index["strain"].isin(seq_keep)
         no_invalid_nucleotides = sequence_index["invalid_nucleotides"] == 0
-        seq_keep_by_valid_nucleotides = sequence_index[
-            (is_in_seq_keep) & (no_invalid_nucleotides)
-        ]["strain"].tolist()
+        seq_keep_by_valid_nucleotides = set(
+            sequence_index[
+                (is_in_seq_keep) & (no_invalid_nucleotides)
+            ]["strain"].tolist()
+        )
 
         num_excluded_by_nuc = len(seq_keep) - len(seq_keep_by_valid_nucleotides)
         seq_keep = seq_keep_by_valid_nucleotides
@@ -393,7 +454,7 @@ def run(args):
 
             # subsample each groups, either by taking the spg highest priority strains or
             # sampling at random from the sequences in the group
-            seq_subsample = []
+            seq_subsample = set()
             subsampling_attempts = 0
 
             # Attempt to subsample with the given constraints for a fixed number
@@ -414,10 +475,23 @@ def run(args):
                         continue
 
                     if args.priority: #sort descending by priority
-                        seq_subsample.extend(sorted(sequences_in_group, key=lambda x:priorities[x], reverse=True)[:tmp_spg])
+                        seq_subsample.update(
+                            set(
+                                sorted(
+                                    sequences_in_group,
+                                    key=lambda x: priorities[x],
+                                    reverse=True
+                                )[:tmp_spg]
+                            )
+                        )
                     else:
-                        seq_subsample.extend(sequences_in_group if len(sequences_in_group)<=tmp_spg
-                                            else random.sample(sequences_in_group, tmp_spg))
+                        seq_subsample.update(
+                            set(
+                                sequences_in_group
+                                if len(sequences_in_group)<=tmp_spg
+                                else random.sample(sequences_in_group, tmp_spg)
+                            )
+                        )
 
             num_excluded_subsamp = len(seq_keep) - len(seq_subsample)
             seq_keep = seq_subsample
@@ -426,25 +500,26 @@ def run(args):
     # Note that this might re-add previously excluded sequences
     # Note that we are also not checking for existing meta data here
     num_included_by_name = 0
-    if args.include and os.path.isfile(args.include):
-        with open(args.include, 'r', encoding='utf-8') as ifile:
-            to_include = set(
-                [
-                    line.strip()
-                    for line in ifile
-                    if line[0]!=comment_char and len(line.strip()) > 0
-                ]
-            )
+    if args.include:
+        # Collect the union of all given strains to include.
+        to_include = read_strains(*args.include)
 
-        for s in to_include:
-            if s not in seq_keep:
-                seq_keep.append(s)
-                num_included_by_name += 1
+        # Find requested strains that can be included because they have metadata
+        # and sequences.
+        available_to_include = available_strains & to_include
+
+        # Track the number of strains that could and could not be included.
+        num_included_by_name = len(available_to_include)
+        num_not_included_by_name = len(to_include - available_to_include)
+
+        # Union the strains that can be included with the sequences to keep.
+        seq_keep = seq_keep | available_to_include
 
     # add sequences with particular meta data attributes
     num_included_by_metadata = 0
     if args.include_where:
-        to_include = []
+        to_include = set()
+
         for ex in args.include_where:
             try:
                 col, val = ex.split("=")
@@ -453,50 +528,90 @@ def run(args):
                 continue
 
             # loop over all sequences and re-add sequences
-            for seq_name in all_seq:
-                if seq_name in meta_dict:
-                    if meta_dict[seq_name].get(col)==val:
-                        to_include.append(seq_name)
-                else:
-                    print("WARNING: no metadata for %s, skipping"%seq_name)
-                    continue
-
-        for s in to_include:
-            if s not in seq_keep:
-                seq_keep.append(s)
-                num_included_by_metadata += 1
+            for seq_name in available_strains:
+                if meta_dict[seq_name].get(col)==val:
+                    to_include.add(seq_name)
+
+        num_included_by_metadata = len(to_include)
+        seq_keep = seq_keep | to_include
+
+    # Write output starting with sequences, if they've been requested. It is
+    # possible for the input sequences and sequence index to be out of sync
+    # (e.g., the index is a superset of the given sequences input), so we need
+    # to update the set of strains to keep based on which strains are actually
+    # available.
+    if is_vcf and args.output:
+        # Get the samples to be deleted, not to keep, for VCF
+        dropped_samps = list(available_strains - seq_keep)
+        write_vcf(args.sequences, args.output, dropped_samps)
+    elif args.sequences and args.output:
+        sequences = SeqIO.parse(args.sequences, "fasta")
 
-    ####Write out files
+        # Stream to disk all sequences that passed all filters to avoid reading
+        # sequences into memory first. Track the observed strain names in the
+        # sequence file as part of the single pass to allow comparison with the
+        # provided sequence index.
+        observed_sequence_strains = set()
+        with open(args.output, "w") as output_handle:
+            for sequence in sequences:
+                observed_sequence_strains.add(sequence.id)
+
+                if sequence.id in seq_keep:
+                    SeqIO.write(sequence, output_handle, 'fasta')
+
+        if sequence_strains != observed_sequence_strains:
+            # Warn the user if the expected strains from the sequence index are
+            # not a superset of the observed strains.
+            if not observed_sequence_strains <= sequence_strains:
+                print(
+                    "WARNING: The sequence index is out of sync with the provided sequences.",
+                    "Augur will only output strains with available sequences.",
+                    file=sys.stderr
+                )
+
+            # Update the set of available sequence strains and which of these
+            # strains passed filters. This prevents writing out strain lists or
+            # metadata for strains that have no sequences.
+            sequence_strains = observed_sequence_strains
+            seq_keep = seq_keep & sequence_strains
+
+            # Recount the strains that have sequences but no metadata and
+            # the strains that have metadata but no sequences.
+            num_excluded_by_lack_of_metadata = len(sequence_strains - metadata_strains)
+            num_excluded_by_lack_of_sequences = len(metadata_strains - sequence_strains)
+
+    if args.output_metadata:
+        metadata_df = pd.DataFrame([meta_dict[strain] for strain in sorted(seq_keep)])
+        metadata_df.to_csv(
+            args.output_metadata,
+            sep="\t",
+            index=False
+        )
 
-    if is_vcf:
-        #get the samples to be deleted, not to keep, for VCF
-        dropped_samps = list(set(all_seq) - set(seq_keep))
-        if len(dropped_samps) == len(all_seq): #All samples have been dropped! Stop run, warn user.
-            print("ERROR: All samples have been dropped! Check filter rules and metadata file format.")
-            return 1
-        write_vcf(args.sequences, args.output, dropped_samps)
+    if args.output_strains:
+        with open(args.output_strains, "w") as oh:
+            for strain in sorted(seq_keep):
+                oh.write(f"{strain}\n")
 
+    # Calculate the number of strains passed and filtered.
+    if sequence_strains is not None:
+        all_strains = metadata_strains | sequence_strains
     else:
-        # It should not be possible to have ids in the list of sequences to keep
-        # that do not exist in the original input sequences, since we built this
-        # list of ids from the sequence index. Just to be safe though, we find
-        # the intersection of these two lists of ids to determine if all samples
-        # were dropped or not. This final list of ids is in the same order as
-        # the input sequences such that output sequences are always in the same
-        # order for a given set of filters.
-        sequences = SeqIO.parse(args.sequences, "fasta")
-        sequences_to_write = (sequence for sequence in sequences if sequence.id in seq_keep)
+        all_strains = metadata_strains
 
-        # Write out sequences that passed all filters using an iterator to
-        # ensure that sequences are streamed to disk without being read into
-        # memory first.
-        sequences_written = SeqIO.write(sequences_to_write, args.output, 'fasta')
+    total_strains_passed = len(seq_keep)
+    total_strains_filtered = len(all_strains) - total_strains_passed
 
-        if sequences_written == 0:
-            print("ERROR: All samples have been dropped! Check filter rules and metadata file format.", file=sys.stderr)
-            return 1
+    print(f"{total_strains_filtered} strains were dropped during filtering")
+
+    if num_excluded_by_lack_of_sequences:
+        print(f"\t{num_excluded_by_lack_of_sequences} had no sequence data")
+
+    if num_excluded_by_lack_of_metadata:
+        print(f"\t{num_excluded_by_lack_of_metadata} had no metadata")
 
-    print("\n%i sequences were dropped during filtering" % (len(all_seq) - len(seq_keep),))
+    if args.exclude_all:
+        print(f"\t{num_excluded_by_all} of these were dropped by `--exclude-all`")
     if args.exclude:
-        print("\t%i of these were dropped because they were in %s" % (num_excluded_by_name, args.exclude))
+        print("\t%i of these were dropped because they were in %s" % (num_excluded_by_name, ", ".join(args.exclude)))
     if args.exclude_where:
@@ -516,12 +631,19 @@ def run(args):
         seed_txt = ", using seed {}".format(args.subsample_seed) if args.subsample_seed else ""
         print("\t%i of these were dropped because of subsampling criteria%s" % (num_excluded_subsamp, seed_txt))
 
-    if args.include and os.path.isfile(args.include):
-        print("\n\t%i sequences were added back because they were in %s" % (num_included_by_name, args.include))
+    if args.include:
+        print(f"\n\t{num_included_by_name} strains were added back because they were requested by include files")
+
+        if num_not_included_by_name:
+            print(f"\t{num_not_included_by_name} strains from include files were not added because they lacked sequence or metadata")
     if args.include_where:
         print("\t%i sequences were added back because of '%s'" % (num_included_by_metadata, args.include_where))
 
-    print("%i sequences have been written out to %s" % (len(seq_keep), args.output))
+    if total_strains_passed == 0:
+        print("ERROR: All samples have been dropped! Check filter rules and metadata file format.", file=sys.stderr)
+        return 1
+
+    print(f"{total_strains_passed} strains passed all filters")
 
 
 def _filename_gz(filename):
diff --git a/augur/utils.py b/augur/utils.py
index 5fb1ed923..b68c484e3 100644
--- a/augur/utils.py
+++ b/augur/utils.py
@@ -686,3 +686,38 @@ def load_mask_sites(mask_file):
     "A", "G", "C", "T", "U", "N", "R", "Y", "S", "W", "K", "M", "B", "V", "D", "H", "-",
     "a", "g", "c", "t", "u", "n", "r", "y", "s", "w", "k", "m", "b", "v", "d", "h", "-"
 }
+
+
+def read_strains(*files, comment_char="#"):
+    """Collect the distinct strain names listed in one or more text files.
+
+    Each line holds at most one strain name. Everything from the first
+    occurrence of `comment_char` to the end of the line is discarded, so both
+    full-line and trailing comments are allowed. Lines that are empty after
+    removing comments and surrounding whitespace are skipped. For example:
+
+        # a comment that takes up the whole line
+        strain1  # a comment that follows a strain name
+        strain2
+
+    Parameters
+    ----------
+    files : one or more str
+        paths to UTF-8 text files with one strain name per line
+    comment_char : str
+        string that starts a comment (keyword-only)
+
+    Returns
+    -------
+    set :
+        union of the strain names found across all given files
+
+    """
+    names = set()
+    for path in files:
+        with open(path, 'r', encoding='utf-8') as handle:
+            # Keep only the text that precedes any comment on each line.
+            candidates = (row.split(comment_char)[0].strip() for row in handle)
+            names.update(name for name in candidates if name)
+
+    return names
diff --git a/tests/builds/zika.t b/tests/builds/zika.t
index c662cb1a2..5e337732b 100644
--- a/tests/builds/zika.t
+++ b/tests/builds/zika.t
@@ -43,8 +43,8 @@ Filter sequences by a minimum date and an exclusion list and only keep one seque
   >   --subsample-seed 314159 \
   >   --no-probabilistic-sampling \
   >   --min-date 2012 > /dev/null
-
-  $ diff -u "results/filtered.fasta" "$TMP/out/filtered.fasta"
+  $ grep "^>" "$TMP/out/filtered.fasta" | wc -l
+  \s*10 (re)
 
 Align filtered sequences to a specific reference sequence and fill any gaps.
 
diff --git a/tests/functional/filter.t b/tests/functional/filter.t
index 65e6877bb..3d15b2bbe 100644
--- a/tests/functional/filter.t
+++ b/tests/functional/filter.t
@@ -33,7 +33,7 @@ This should fail, as probabilistic sampling is explicitly disabled.
   >  --subsample-seed 314159 \
   >  --no-probabilistic-sampling \
   >  --output "$TMP/filtered.fasta"
-  ERROR: Asked to provide at most 5 sequences, but there are 10 groups.
+  ERROR: Asked to provide at most 5 sequences, but there are 8 groups.
   [1]
   $ rm -f "$TMP/filtered.fasta"
 
@@ -63,3 +63,188 @@ Using the default probabilistic subsampling, should work the same as the previou
   >  --subsample-seed 314159 \
   >  --output "$TMP/filtered.fasta" > /dev/null
   $ rm -f "$TMP/filtered.fasta"
+
+Filter using only metadata without sequence input or output and save results as filtered metadata.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
+
+Output should include the 8 sequences matching the filters and a header line.
+
+  $ wc -l "$TMP/filtered_metadata.tsv"
+  \s*9 .* (re)
+  $ rm -f "$TMP/filtered_metadata.tsv"
+
+Filter using only metadata and save results as a list of filtered strains.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+
+Output should include only the 8 sequences matching the filters (without a header line).
+
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*8 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Filter using only metadata without a sequence index.
+This should work because the requested filters don't rely on sequence information.
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --min-date 2012 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Try to filter using only metadata without a sequence index.
+This should fail because the requested filters rely on sequence information.
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --min-length 10000 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
+  [1]
+
+Try to filter with sequence outputs and no sequence inputs.
+This should fail.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-length 10000 \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  ERROR: You need to provide sequences to output sequences.
+  [1]
+
+Try to filter without any outputs.
+
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --min-length 10000 > /dev/null
+  ERROR: You need to select at least one output.
+  [1]
+
+Filter into two separate sets and then select sequences from the union of those sets.
+First, select strains from Brazil (there should be 1).
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --query "country == 'Brazil'" \
+  >  --output-strains "$TMP/filtered_strains.brazil.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.brazil.txt"
+  \s*1 .* (re)
+
+Then, select strains from Colombia (there should be 3).
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --query "country == 'Colombia'" \
+  >  --output-strains "$TMP/filtered_strains.colombia.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.colombia.txt"
+  \s*3 .* (re)
+
+Finally, exclude all sequences except those from the two sets of strains (there should be 4).
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --exclude-all \
+  >  --include "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*4 (re)
+  $ rm -f "$TMP/filtered.fasta"
+
+Alternately, exclude only the sequences from Brazil and Colombia (10 - 4 strains).
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/sequences.fasta \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --exclude "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*6 (re)
+  $ rm -f "$TMP/filtered.fasta"
+
+Try to filter with sequences that don't match any of the metadata.
+This should produce no results because the intersection of metadata and sequences is empty.
+
+  $ echo -e ">something\nATCG" > "$TMP/dummy.fasta"
+  $ ${AUGUR} filter \
+  >  --sequences "$TMP/dummy.fasta" \
+  >  --metadata filter/metadata.tsv \
+  >  --max-date 2020-01-30 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  WARNING: A sequence index was not provided, so we are generating one. Generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [1]
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*0 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Repeat with sequence and strain outputs. We should get the same results.
+
+  $ ${AUGUR} filter \
+  >  --sequences "$TMP/dummy.fasta" \
+  >  --metadata filter/metadata.tsv \
+  >  --max-date 2020-01-30 \
+  >  --output-strains "$TMP/filtered_strains.txt" \
+  >  --output-sequences "$TMP/filtered.fasta" > /dev/null
+  WARNING: A sequence index was not provided, so we are generating one. Generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [1]
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*0 .* (re)
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*0 (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+  $ rm -f "$TMP/filtered.fasta"
+
+Filter TB strains from VCF and save as a list of filtered strains.
+
+  $ ${AUGUR} filter \
+  >  --sequences filter/tb.vcf.gz \
+  >  --metadata filter/tb_metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*3 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Confirm that filtering omits strains without metadata or sequences.
+The input sequences are missing one strain that is in the metadata.
+The metadata are missing one strain that has a sequence.
+The list of strains to include has one strain with no metadata/sequence and one strain with information that would have been filtered by country.
+The query initially filters 3 strains from Colombia, one of which is added back by the include.
+
+  $ echo "NotReal" > "$TMP/include.txt"
+  $ echo "COL/FLR_00008/2015" >> "$TMP/include.txt"
+  $ ${AUGUR} filter \
+  >  --sequence-index filter/sequence_index.tsv \
+  >  --metadata filter/metadata.tsv \
+  >  --query "country != 'Colombia'" \
+  >  --include "$TMP/include.txt" \
+  >  --output-strains "$TMP/filtered_strains.txt"
+  4 strains were dropped during filtering
+  \t1 had no sequence data (esc)
+  \t1 had no metadata (esc)
+  \t3 of these were filtered out by the query: (esc)
+  \t\t"country != 'Colombia'" (esc)
+   (esc)
+  \t1 strains were added back because they were requested by include files (esc)
+  \t1 strains from include files were not added because they lacked sequence or metadata (esc)
+  8 strains passed all filters
+
+  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/metadata.tsv b/tests/functional/filter/metadata.tsv
index 319d519b6..dc66a193c 100644
--- a/tests/functional/filter/metadata.tsv
+++ b/tests/functional/filter/metadata.tsv
@@ -1,5 +1,4 @@
 strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
-PAN/CDC_259359_V1_V3/2015	zika	KX156774	2015-12-18	North America	Panama	Panama	Panama	genbank	genome	Shabman et al	https://www.ncbi.nlm.nih.gov/nuccore/KX156774	Direct Submission	Submitted (29-APR-2016) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
 COL/FLR_00024/2015	zika	MF574569	2015-12-XX	South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574569	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
 PRVABC59	zika	KU501215	2015-12-XX	North America	Puerto Rico	Puerto Rico	Puerto Rico	genbank	genome	Lanciotti et al	https://www.ncbi.nlm.nih.gov/nuccore/KU501215	Phylogeny of Zika Virus in Western Hemisphere, 2015	Emerging Infect. Dis. 22 (5), 933-935 (2016)	https://www.ncbi.nlm.nih.gov/pubmed/27088323
 COL/FLR_00008/2015	zika	MF574562	2015-12-XX	South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574562	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
diff --git a/tests/functional/filter/sequence_index.tsv b/tests/functional/filter/sequence_index.tsv
index 9ead2af42..16dafa0bd 100644
--- a/tests/functional/filter/sequence_index.tsv
+++ b/tests/functional/filter/sequence_index.tsv
@@ -10,4 +10,3 @@ DOM/2016/BB_0059	10035	2563	2089	2741	2015	621	6	0	0	0
 BRA/2016/FC_6706	10366	2747	2203	2915	2165	329	7	0	0	0
 DOM/2016/BB_0183	10621	2910	2343	3099	2269	0	0	0	0	0
 EcEs062_16	10812	2960	2388	3158	2306	0	0	0	0	0
-HND/2016/HU_ME59	10365	2842	2271	3016	2233	0	3	0	0	0
diff --git a/tests/functional/filter/sequences.fasta b/tests/functional/filter/sequences.fasta
index 3e37aa57f..8149494fc 100644
--- a/tests/functional/filter/sequences.fasta
+++ b/tests/functional/filter/sequences.fasta
@@ -1962,177 +1962,3 @@ ggcctgaactggagatcagctgtggatctccagaagagggactagtggttagaggagacc
 ccccggaaaacgcaaaacagcatattgacgctgggaaagaccagagactccatgagtttc
 caccacgctggccgccaggcacagatcgccgaatagcggcggccggtgtggggaaatcca
 tgggagatcgga
->HND/2016/HU_ME59
-gtttgaagcgaaagctagcaacagtatcaacaggttttattttggatttggaaacgagag
-tttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgcta
-aaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggactt
-ctgctgggtcatgggcccatcaggatggtcttggcgattctagcctttttgagattcacg
-gcaatcaagccatcactgggtctcatcaatagatggggttcagtggggaaaaaagaggct
-atggaaataataaagaagttcaagaaagatctggctgccatgctgagaataatcaatgct
-aggaaggagaagaagagacgaggcgcagatactagtgtcggaattgttggcctcctgctg
-accacagctatggcagcggaggtcactagacgtgggagtgcatactatatgtacttggac
-agaaacgatgctggggaggccatatcttttccaaccacattggggatgaataagtgttat
-atacagatcatggatcttggacacatgtgtgatgccaccatgagctatgaatgccctatg
-ctggatgagggggtggaaccagatgacgtcgattgttggtgcaacacgacgtcaacttgg
-gttgtgtacggaacctgccatcacaaaaaaggtgaagcacggagatctagaagagctgtg
-acgctcccctcccattccactaggaagctgcaaacgcggtcgcaaacctggttggaatca
-agagaatacacaaagcacttgattagagtcgaaaattggatattcaggaaccctggcttc
-gcgttagcagcagctgccatcgcttggcttttgggaagctcaacgagccaaaaagtcata
-tacttggtcatgatactgctgattgccccggcatacagcatcaggtgcataggagtcagc
-aatagggactttgtggaaggtatgtcaggtgggacttgggttgatgttgtcttggaacat
-ggaggttgtgtcaccgtaatggcacaggacaaaccgactgtcgacatagagctggttaca
-acaacagtcagcaacatggcggaggtaagatcctactgctatgaggcatcaatatcagac
-atggcttcggacagccgctgcccaacacaaggtgaagcctaccttgacaagcaatcagac
-actcaataygtctgcaaaagaacgttagtggacagaggctggggaaatggatgtggactt
-tttggcaaagggagcctggtgacatgcgctaagtttgcatgctccaagaaaatgaccggg
-aagagcatccagccagagaatctggagtaccggataatgctgtcagttcatggctcccag
-cacagtgggatgatcgttaatgacacaggacatgaaactgatgagaatagagcgaaggtt
-gagataacgcccawttcaccaagagccgaagccaccctggggggttttggaagcctagga
-cttgattgtgaaccgaggacaggccttgacttttcagatttgtattacttgactatgaat
-aacaagcactggttggttcacaaggagtggttccacgacattccattaccttggcacgct
-ggggcagacaccggaactccacactggaacaacaaagaagcactggtagagttcaaggac
-gcacatgccaaaaggcaaactgtcgtggttctagggagtcaagaaggagcagttcacacg
-gcccttgctggagctctggaggctgagatggatggtgcaaagggaaggctgtcctctggc
-cacttgaaatgtcgcctgaaaatggataaacttagattgaagggcgtgtcatactccttg
-tgtaccgcagcgttcacattcaccaagatcccggctgaaacactgcacgggacagtcaca
-gtggaggtacagtacgcagggacagatggaccttgcaaggttccagctcagatggcggtg
-gacatgcaaactctgaccccagttgggaggttgataaccgctaaccccgtaatcactgaa
-agcactgagaactctaagatgatgctggaacttgatccaccatttggggactcttacatt
-gtcataggagtcggggagaagaagatcacccaccactggcacaggagtggcagcaccatt
-ggaaaagcatttgaagccactgtgagaggtgccaagagaatggcagtcttgggagacaca
-gcctgggactttggatcagttggaggcgctctcaactcattgggcaagggcatccatcaa
-atttttggagcagctttcaaatcattgtttggaggaatgtcctggttctcacaaattctc
-attggaacgttgctgatgtggttgggtctgaacacaaagaatggatctatttcccttatg
-tgcttggccttagggggagtgttgatcttcttatccacagccgtctctgctgatgtgggg
-tgctcggtggacttctcaaagaaggagacgagatgcggtacaggggtgttcgtctataac
-gacgttgaagcctggagggacaggtacaagtaccatcctgactccccccgtagattggca
-gcagcagtcaagcaagcctgggaagatggtatctgcgggatctcctctgtttcaagaatg
-gaaaacatcatgtggagatcagtagaaggggagctcaacgcaatcctggaagagaatgga
-gttcaactgacggtcgttgtgggatctgtaaaaaaccccatgtggagagctccacagaga
-ttgcccgtgcctgtgaacgagctgccccacggctggaaggcttgggggaaatcgtacttc
-gtcagagcagcaaagacaaataacagctttgtcgtggatggtgacacactgaaggaatgc
-ccactcaaacatagagcatggaacagctttcttgtggaggatcatgggttcggggtattt
-cacactagtgtctggctcaaggttagagaagattattcattagagtgtgatccagccgtt
-attggaacagctgttaagggaaaggaggctgtacacagtgatctaggctactggattgag
-agtgagaagaatgacacatggaggctgaagagggcccatctgatcgagatgaaaacatgt
-gaatggccaaagtcccacacattgtggacagatggaatagaagagagtgatctgatcata
-cccaagtctttagctgggccactcagccatcacaataccagagagggctacaggacccaa
-atgaaagggccatggcacagtgaagagcttgaaattcggtttgaggaatgcccaggcact
-aaggtccacgtggaggaaacatgtggaacaagaggaccatctctgagatcaaccactgca
-agcggaagggtgatcgaggaatggtgctgcagggagtgcacaatgcccccactgtcgttc
-cgggctaaagatggctgttggtatggaatggagataaggcccaggaaagaaccagaaagc
-aacttagtaaggtcaatggtgactgcaggatcaactgatcacatggatcacttctccctt
-ggagtgcttgtgattctgctcatggtgcaggaagggctaaagaagagaatgaccacaaag
-atcatcataagcacatcaatggcagtgctggtagctatgatcctgggaggattttcaatg
-agtgacctggctaagcttgcaattttgatgggtgccaccttcgcggaaatgaacactgga
-ggagatgtagctcatctggcgctgatagcggcattcaaagtcagaccagcgttgctggta
-tctttcatcttcagagctaattggacaccccgtgaaagcatgctactggccttggcctcg
-tgtcttttgcaaactgcgatctccgccttggaaggcgacctgatggttctcatcaatggt
-tttgctttggcctggttggcaatacgagcgatggttgttccacgcactgataacatcacc
-ttggcaatcctggctgctctgacaccactggcccggggcacactgcttgtggcgtggaga
-gcaggccttgctacttgcggggggtttatgctcctctctctgaagggaaaaggcagtgtg
-aagaagaacttaccatttgtcatggccctgggactaaccgctgtgaggctggtcgacccc
-atcaacgtggtgggactgctgttgctcacaaggagtgggaagcggagctggccccctagc
-gaagtactcacagctgttggcctgatatgcgcattggctggagggttcgccaaggcagat
-atagagatggctgggcccatggccgcggtcggtctgctaattgtcagttacgtggtctca
-ggaaagagtgtggacatgtacattgaaagagcaggtgacatcacatgggaaaaagatgcg
-gaagtcactggaaacagtccccggctcgatgtggcgctagatgagagtggtgatttctcc
-ctggtggaggatgacggtccccccatgagagagatcatactcaaggtggtcctgatgacc
-atctgtggcatgaacccaatagccataccctttgcagctggagcgtggtacgtatacgtg
-aagactggaaaaaggagtggtgctctatgggatgtgcctgctcccaaggaagtaaaaaag
-ggggagaccacagatggagtgtacagagtaatgactcgtagactgctaggttcaacacaa
-gttggagtgggagtcatgcaagagggggtctttcacactatgtggcacgtcacaaaagga
-tccgcactgagaagcggtgaagggagacttgatccatactggggagatgtcaagcaggat
-ctggtgtcatactgtggtccatggaagctagatgccgcctgggacgggcacagcgaggtg
-cagctcctggccgtgccccccggagagagagcgaggaacatccagactctgcccggaata
-tttaagacaaaggatggggacattggagcggttgcgctggattacccagcaggaacttca
-ggatctccaatcctagacaagtgtgggagagtgataggactttatggcaatggggtcgtg
-atcaaaaatgggagttatgttagtgccatcacccaagggaggagggaggaagagactcct
-gttgagtgcttcgagccttcgatgctgaagaagaagcagctaactgtcttagacttacat
-cctggagctgggaaaaccaggagagttcttcctgaaatagtccgtgaagccataaaaaca
-agactccgtactgtgatcttagctccaaccagggttgtcgctgctgaaatggaggaggcc
-cttagagggcttccagtgcgttatatgacaacagcagtcaatgtcacccactctggaaca
-gaaatcgtcgacttaatgtgccatgccaccttcacttcacgtctactacagccaatcaga
-gtccccaactataatctgtatattatggatgaggcccacttcacagatccctcaagtata
-gcagcaagaggatacatttcaacaagggttgagatgggcgaggcggctgccatcttcatg
-accgccacgccaccaggaacccgtgacgcatttccggactccaactcaccaattatggac
-accgaagtggaagtcccagagagagcctggagctcaggctttgattgggtgacggatcat
-tctggaaaaacagtttggtttgttccaagcgtgaggaacggcaatgagatcgcagcttgt
-ctgacaaaggctggaaaacgggtcatacagctcagcagaaagacttttgagacagagttc
-cagaaaacaaaacatcaagagtgggactttgtcgtgacaactgacatttcagagatgggc
-gccaactttaaagctgaccgtgtcatagattccaggagrtgcctaaagccggtcatactt
-gatggcgagagagtcattctggctggacccatgcctgtcacacatgccagcgctgcccag
-aggagggggcgcataggcaggaatcccaacaaacctggagatgagtatctgtatggaggt
-gggtgcgcagagactgacgaagaccatgcacactggcttgaagcaagaatgctccttgac
-aatatttacctccaagatggcctcatagcctcgctctatcgacctgaggccgacaaagta
-gcagccattgagggagagttcaagcttaggacggagcaaaggaagacctttgtggaactc
-atgaaaagaggagatcttcctgtttggctggcctatcaggttgcatctgccggaataacc
-tacacagatagaagatggtgctttgatggcacgaccaacaacaccatactggaagacagt
-gtgccggcagaggtgtggaccagacacggagagaaaagagtgctcaaaccgaggtggatg
-gacgccagagtttgttcagatcatgcggccctgaagtcattcaaggagtttgccgctggg
-aaaagaggagcggcttttggagtgatggaagccctgggaacactgccaggacacatgaca
-gagagattccaggaagccattgacaacctcgctgtgctcatgcgggcagagactggaagc
-aggccttacaaagccgcggcggcccaattgccggagaccctagagaccattatgcttttg
-gggttgctgggaacagtctcgctgggaatctttttcgtcttgatgaggaacaagggcata
-gggaagatgggctttggaatggtgacccttggggccagtgcatggctcatgtggctctcg
-gaaattgagccagccagaattgcatgtgtcctcattgttgtgttcctattgctggtggtg
-ctcatacctgagccagaaaagcaaagatctccccaggacaaccaaatggcaatcatcatc
-atggtagcagtaggtcttctgggcttgattaccgccaatgaactcggatggttggagaga
-acaaagagtgacctaagccatctgatgggaaggagagaggagggggcaaccataggattc
-tcaatggacattgacctgcggccagcctcagcttgggccatctatgctgccttgacaact
-ttcattaccccagccgtccaacatgcagtgaccacttcatacaacaactactccttaatg
-gcgatggccacgcaagctggagtgttgtttggtatgggcaaagggatgccattctacgca
-tgggactttggagtcccgctgctaatgataggttgctactcacaattaacacccctgacc
-ctaatagtggccatcattttgctcgtggcgcactacatgtacttgatcccagggctgcag
-gcagcagctgcgcgtgctgcccagaagagaacggcagctggcatcatgaagaaccctgtt
-gtggatggaatagtggtgactgacattgacacaatgacaattgacccccaagtggagaaa
-aagatgggacaggtgctactcatagcagtagccgtctccagcgccatactgtcgcggacc
-gcctgggggtggggggaggctggggccctgatcacagccgcaacttccactttgtgggaa
-ggctctccgaacaagtactggaactcctctacagccacttcactgtgtaacatttttagg
-ggaagttacttggctggagcttctctaatctacacagtaacaagaaacgctggcttggtc
-aagagacgtgggggtggaacaggagagaccctgggagagaaatggaaggcccgcttgaac
-cagatgtcggccctggagttctactcctacaaaaagtcaggcatcaccgaggtgtgcaga
-gaagaggcccgccgcgccctcaaggacggtgtggcaacgggaggccatgctgtgtcccga
-ggaagtgcaaagctgagatggttggtggagcggggatacctgcagccctatggaaaggtc
-attgatcttggatgtggcagagggggctggagttactacgccgccaccatccgcaaagtt
-caagaagtgaaaggatacacaaaaggaggccctggtcatgaagaacccgtgttggtgcaa
-agctatgggtggaacatagtccgtcttaagagtggggtggacgtctttcatatggcggct
-gagccgtgtgacacgttgctgtgtgacataggtgagtcatcatctagtcctgaagtggaa
-gaagcacggacgctcagagtcctctccatggtgggggattggcttgaaaaaagaccagga
-gccttttgtataaaagtgttgtgcccatacaccagcactatgatggaaaccctggagcga
-ctgcagcgtaggtatgggggaggactggtcagagtgccactctcccgcaactctacacat
-gagatgtactgggtctctggagcgaaaagcaacaccataaaaagtgtgtccaccacgagc
-cagctcctcttggggcgcatggacgggcctaggaggccagtgaaatatgaggaggatgtg
-aatctcggctctggcacgcgggctgtggtaagctgcgctgaagctcccaacatgaagatc
-attggtaaccgcattgaaaggatccgcagtgagcacgcggaaacgtggttctttgacgag
-aaccacccatataggacatgggcttaccatggaagctatgaggcccccacacaagggtca
-gcgtcctctctaataaacggggttgtcaggctcctgtcaaaaccctgggatgtggtgact
-ggagtcacaggaatagccatgaccgacaccacaccgtatggtcagcaaagagttttcaag
-gaaaaagtggacactagggtgccagacccccaagaaggcactcgtcaggttatgagcatg
-gtctcttcctggttgtggaaagagctaggcaaacacaaacggccacgagtctgtaccaaa
-gaagagttcatcaacaaggttcgtagcaatgcagcattaggggcaatatttgaagaggaa
-aaagagtggaagactgcagtggaagctgtgaacgatccaaggttctgggctctagtggac
-aaggaaagagagcaccacctgagaggagagtgccagagttgtgtgtacaacatgatggga
-aaaagagaaaagaaacaaggggaatttggaaaggccaagggcagccgcgccatctggtat
-atgtggctaggggctagatttctagagttcgaagcccttggattcttgaacgaggatcac
-tggatggggagagagaactcaggaggtggtgttgaagggctgggattacaaagactcgga
-tatgtcctagaagagatgagttgcataccaggaggaaggatgtatgcagatgacactgct
-ggctgggacacccgcatcagcaggtttgatctggagaatgaagctctaatcaccaaccaa
-atggagaaagggcacagggccttggcattggccataatcaagtacacataccaaaacaaa
-gtggtaaaggtccttagaccagctgaaaaagggaaaacagttatggacattatttcgaga
-caagaccaaagggggagcggacaagttgtcacttacgctcttaacacatttaccaaccta
-gtggtgcaactcatccggaatatggaggctgaggaagttctagagatgcaagacttgtgg
-ctgctgcggaggtcagagaaagtgaccaactggttgcagagcaacggatgggataggctc
-aaacgaatggcagtcagtggagatgattgcgttgtgaagccaattgatgataggtttgca
-catgccctcaggttcttgaatgatatgggaaaagttaggaaggacacacaagagtggaaa
-ccctcaactggatgggacaactgggaagaagttccgttttgctcccaccacttcaacaag
-ctccatctcaaggacgggaggtccattgtggttccctgccgccaccaagatgaactgatt
-ggccgggcccgcgtctctccaggggcgggatggagcatccgggagactgcttgcctagca
-aaatcatatgcgcaaatgtggcagctcctttatttccacagaagggacctccgactgatg
-gccaatgccatttgttcatctgtgccagttgactgggttccaactgggagaactacctgg
-tcaatccatggaaagggagaatggatgaccactgaagacatgcttgtggtgtggaacaga
-gtgtggattgaggagaacgaccacatggaagacaagaccccagttacgaaatggacagac
-attccctatttgggaaaaagggaagacttgtggtgtggatctctcatagggcacagaccg
-cgcaccacctgggctgagaacattaaaaacacagtcaacatggtgcgcaggatcataggt
-gatgaagaaaagtacatggactacctatccacccaagttcgctacttgggtgaagaaggg
-tctacacctggagtgctgtaagcaccaatcttaatgttgtcaggc
diff --git a/tests/functional/filter/tb.vcf.gz b/tests/functional/filter/tb.vcf.gz
new file mode 100644
index 000000000..1eb0eeba4
Binary files /dev/null and b/tests/functional/filter/tb.vcf.gz differ
diff --git a/tests/functional/filter/tb_metadata.tsv b/tests/functional/filter/tb_metadata.tsv
new file mode 100644
index 000000000..20eb69dc8
--- /dev/null
+++ b/tests/functional/filter/tb_metadata.tsv
@@ -0,0 +1,166 @@
+strain	accession	date	region	country	location	db	authors	cluster	paper_url	title
+G22670	10155	1991-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-I	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22671	10223	1992-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22672	11011	1991-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-I	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22673	11234	1992-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-I	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22674	14069	1993-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-I	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22675	14508	1993-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22676	15613	1994-XX-XX	north_america	canada	village_e	genbank	Lee et al	Mj-V.d	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22677	16490	1995-XX-XX	north_america	canada	village_e	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22678	16493	1995-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-I	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22679	18421	1996-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-I	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22680	18422	1996-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-I	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22681	18747	1996-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-II	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22682	18988	1996-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-II	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22683	19057	1996-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-II	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22684	19276	1996-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-II	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22685	50045	1996-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22686	50179	1997-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-II	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22687	50248	1997-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22688	53221	1998-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22689	54902	1999-XX-XX	north_america	canada	other_village	genbank	Lee et al	Mj-IV.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22690	55546	2000-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22691	55753	2000-XX-XX	north_america	canada	village_b	genbank	Lee et al	Mj-VI	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22692	55988	2005-XX-XX	north_america	canada	village_b	genbank	Lee et al	Mj-VI	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22693	55989	2000-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22694	56828	2000-XX-XX	north_america	canada	other_village	genbank	Lee et al	Mn	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22695	57052	2001-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-II	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22696	58385	2001-XX-XX	north_america	canada	other_village	genbank	Lee et al	NA	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22697	60053	2002-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22698	62796	2004-XX-XX	north_america	canada	village_e	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22699	62806	2004-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22700	62957	2004-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22701	63113	2004-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22702	63670	2004-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-II	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22703	63878	2004-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22704	64165	2004-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-II	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22705	64334	2005-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22706	64712	2005-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22707	65165	2005-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22708	66591	2006-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22709	68995	2007-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22710	73787	2010-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22711	74856	2010-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22712	78932	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22713	79031	2012-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22669	9965	1991-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22589	MT-0080	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22668	MT-0712	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22582	MT-0718	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22587	MT-0972	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22590	MT-1103	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22591	MT-1128	2011-XX-XX	north_america	canada	village_b	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22592	MT-1167	2008-XX-XX	north_america	canada	village_b	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22593	MT-1206	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22666	MT-1212	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22594	MT-1247	2012-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mn	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22714	MT-13-1408	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mn	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22715	MT-13-1711	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mn	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22716	MT-13-1712	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mn	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22717	MT-13-1753	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22718	MT-13-1828	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mn	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22719	MT-13-1835	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mn	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22720	MT-13-1892	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mn	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22721	MT-13-2012	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mn	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22722	MT-13-2334	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22723	MT-13-2384	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22725	MT-13-2690	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22726	MT-13-2761	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22727	MT-13-3209	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22728	MT-13-848	2013-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mn	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22595	MT-131	2011-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22596	MT-1336	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22597	MT-1345	2011-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22598	MT-1393	2008-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22599	MT-140	2009-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22600	MT-1403	2009-XX-XX	north_america	canada	village_e	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22601	MT-1466	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22602	MT-1499	2011-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22603	MT-1549	2008-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22604	MT-1605	2008-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22605	MT-1684	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22606	MT-1799	2011-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22607	MT-1838	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22608	MT-1971	2011-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22609	MT-2151	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22610	MT-2174	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22611	MT-2175	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22612	MT-2178	2011-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22613	MT-2184	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22614	MT-2224	2008-XX-XX	north_america	canada	village_e	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22615	MT-2356	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22616	MT-2465	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22617	MT-2473	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22618	MT-2474	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22619	MT-2538	2011-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.d	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22620	MT-2665	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22621	MT-2667	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22622	MT-2706	2010-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22623	MT-2720	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22624	MT-2762	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22625	MT-2768	2009-XX-XX	north_america	canada	village_e	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22626	MT-2769	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22627	MT-2771	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22628	MT-2792	2006-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22629	MT-2800	2010-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22631	MT-289	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22632	MT-2905	2006-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22633	MT-2910	2006-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22634	MT-2931	2006-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22635	MT-3000	2009-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22636	MT-3004	2009-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22637	MT-3074	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22638	MT-3173	2009-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22639	MT-3194	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22640	MT-3239	2006-XX-XX	north_america	canada	other_village	genbank	Lee et al	Mj-V.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22641	MT-3255	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22642	MT-3271	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22643	MT-3281	2010-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22644	MT-3296	2011-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22645	MT-3341	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22648	MT-3673	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22649	MT-3683	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22651	MT-3787	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22652	MT-389	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22653	MT-393	2009-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22729	MT-398	2005-XX-XX	north_america	canada	other_village	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22654	MT-405	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22655	MT-4067	2007-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22656	MT-4137	2010-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22667	MT-4166	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22730	MT-4230	2010-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22657	MT-441	2011-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22731	MT-4466	2011-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22658	MT-452	2011-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22659	MT-467	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22660	MT-4683	2011-XX-XX	north_america	canada	village_d	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22661	MT-4846	2011-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22662	MT-4854	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22663	MT-4884	2011-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22664	MT-4942	2007-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22665	MT-504	2011-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22565	MT-5195	2007-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22566	MT-5337	2007-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22567	MT-5373	2009-XX-XX	north_america	canada	other_village	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22568	MT-5383	2007-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22569	MT-5447	2011-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22732	MT-5488	2007-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22570	MT-5531	2011-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22571	MT-5543	2007-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22572	MT-567	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22733	MT-578	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22573	MT-5870	2009-XX-XX	north_america	canada	other_village	genbank	Lee et al	Mj-IV.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22574	MT-5983	2011-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22575	MT-6084	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22576	MT-6205	2011-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-V.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22577	MT-6218	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22578	MT-6226	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22579	MT-6429	2012-XX-XX	north_america	canada	village_k	genbank	Lee et al	Mj-III.b	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22580	MT-661	2009-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22581	MT-692	2009-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22583	MT-721	2008-XX-XX	north_america	canada	village_a	genbank	Lee et al	Mj-III.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22584	MT-751	2008-XX-XX	north_america	canada	village_b	genbank	Lee et al	Mj-V.a	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22585	MT-853	2011-XX-XX	north_america	canada	village_c	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22586	MT-877	2010-XX-XX	north_america	canada	other_village	genbank	Lee et al	Mj-IV.c	http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit 
+G22650	None1	20XX-XX-XX	north_america	canada	?	genbank	Lee et al		http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
+G22724	None2	20XX-XX-XX	north_america	canada	?	genbank	Lee et al		http://www.pnas.org/content/112/44/13609	Population Genomics of Mycobacterium tuberculosis in the Inuit
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 264bcc643..a7d439804 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,4 +1,5 @@
 import datetime
+from pathlib import Path
 from unittest.mock import patch
 
 import pytest
@@ -129,3 +130,18 @@ def test_is_date_ambiguous(self):
         # Test incomplete date strings without ambiguous dates for the requested fields.
         assert not utils.is_date_ambiguous("2019", "year")
         assert not utils.is_date_ambiguous("2019-10", "month")
+
+    def test_read_strains(self, tmpdir):
+        # Write one list of strain names with an inline comment and some unnecessary whitespace.
+        strains1 = Path(tmpdir) / Path("strains1.txt")
+        with open(strains1, "w") as oh:
+            oh.write("strain1 # this is an inline comment about strain 1\nstrain2\n   # this is a comment preceded by whitespace.\n")
+
+        # Write another list of strain names that starts with a comment line.
+        strains2 = Path(tmpdir) / Path("strains2.txt")
+        with open(strains2, "w") as oh:
+            oh.write("# this is a comment. ignore this.\nstrain2\nstrain3\n")
+
+        strains = utils.read_strains(strains1, strains2)
+        assert len(strains) == 3
+        assert "strain1" in strains