diff --git a/augur/filter.py b/augur/filter.py index 39e909cd3..6e08f453e 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -14,7 +14,7 @@ import treetime.utils from .index import index_sequences -from .utils import read_metadata, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous +from .utils import read_metadata, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous comment_char = '#' MAX_NUMBER_OF_PROBABILISTIC_SAMPLING_ATTEMPTS = 10 @@ -92,32 +92,49 @@ def filter_by_query(sequences, metadata_file, query): return [seq for seq in sequences if seq in filtered_meta_dict] def register_arguments(parser): - parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta or VCF format") - parser.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV") - parser.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.") - parser.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD") - parser.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD") - parser.add_argument('--min-length', type=int, help="minimal length of the sequences") - parser.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters") - parser.add_argument('--exclude', type=str, help="file with list of strains that are to be excluded") - parser.add_argument('--include', type=str, help="file with list of strains that are to be included regardless of priorities or subsampling") - parser.add_argument('--priority', type=str, help="file with list of priority scores for sequences (strain\tpriority)") - subsample_group = parser.add_mutually_exclusive_group() - subsample_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category") - subsample_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences") - parser.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist") - probabilistic_sampling_group = parser.add_mutually_exclusive_group() - probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.") - probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling') - parser.add_argument('--subsample-seed', help="random number generator seed to allow reproducible sub-sampling (with same input data). Can be number or string.") - parser.add_argument('--exclude-where', nargs='+', + input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered") + input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV") + input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format") + input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.") + + metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata") + metadata_filter_group.add_argument( + '--query', + help="""Filter samples by attribute. + Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax. + (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")""" + ) + metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD") + metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD") + metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'], + help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").') + metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude") + metadata_filter_group.add_argument('--exclude-where', nargs='+', help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND") - parser.add_argument('--include-where', nargs='+', + metadata_filter_group.add_argument('--exclude-all', action="store_true", help="exclude all strains by default. Use this with the include arguments to select a specific subset of strains.") + metadata_filter_group.add_argument('--include', type=str, nargs="+", help="file(s) with list of strains to include regardless of priorities or subsampling") + metadata_filter_group.add_argument('--include-where', nargs='+', help="Include samples with these values. ex: host=rat. Multiple values are processed as OR (having any of those specified will be included), not AND. This rule is applied last and ensures any sequences matching these rules will be included.") - parser.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'], - help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").') - parser.add_argument('--query', help="Filter samples by attribute. Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.") - parser.add_argument('--output', '-o', help="output file", required=True) + metadata_filter_group.add_argument('--priority', type=str, help="file with list of priority scores for strains (strain\tpriority)") + + sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data") + sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences") + sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters") + + subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data") + subsample_group.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist") + subsample_limits_group = subsample_group.add_mutually_exclusive_group() + subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category") + subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences") + probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group() + probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.") + probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling') + subsample_group.add_argument('--subsample-seed', help="random number generator seed to allow reproducible sub-sampling (with same input data). Can be number or string.") + + output_group = parser.add_argument_group("outputs", "possible representations of filtered data (at least one required)") + output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format") + output_group.add_argument('--output-metadata', help="metadata for strains that passed filters") + output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)") parser.set_defaults(probabilistic_sampling=True) @@ -125,10 +142,48 @@ def run(args): ''' filter and subsample a set of sequences into an analysis set ''' + # Validate arguments before attempting any I/O. + # Don't allow sequence output when no sequence input is provided. + if args.output and not args.sequences: + print( + "ERROR: You need to provide sequences to output sequences.", + file=sys.stderr) + return 1 + + # Confirm that at least one output was requested. + if not any((args.output, args.output_metadata, args.output_strains)): + print( + "ERROR: You need to select at least one output.", + file=sys.stderr) + return 1 + + # Don't allow filtering on sequence-based information, if no sequences or + # sequence index is provided. + SEQUENCE_ONLY_FILTERS = [ + args.min_length, + args.non_nucleotide + ] + if not args.sequences and not args.sequence_index and any(SEQUENCE_ONLY_FILTERS): + print( + "ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.", + file=sys.stderr) + return 1 + + # Load inputs, starting with metadata. + try: + # Metadata are the source of truth for which sequences we want to keep + # in filtered output. + meta_dict, meta_columns = read_metadata(args.metadata) + metadata_strains = set(meta_dict.keys()) + except ValueError as error: + print("ERROR: Problem reading in {}:".format(args.metadata)) + print(error) + return 1 + #Set flags if VCF is_vcf = False is_compressed = False - if any([args.sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]): + if args.sequences and any([args.sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]): is_vcf = True if args.sequences.lower().endswith('.gz'): is_compressed = True @@ -142,12 +197,13 @@ def run(args): "Please see the augur install instructions to install it.") return 1 - ####Read in files + # Read in files - #If VCF, open and get sequence names + # If VCF, open and get sequence names if is_vcf: - seq_keep, all_seq = read_vcf(args.sequences) - else: + vcf_sequences, _ = read_vcf(args.sequences) + sequence_strains = set(vcf_sequences) + elif args.sequences or args.sequence_index: # If FASTA, try to load the sequence composition details and strain # names to be filtered. index_is_autogenerated = False @@ -180,45 +236,46 @@ def run(args): # Calculate summary statistics needed for filtering. sequence_index["ACGT"] = sequence_index.loc[:, ["A", "C", "G", "T"]].sum(axis=1) - seq_keep = sequence_index["strain"].values - all_seq = seq_keep.copy() + sequence_strains = set(sequence_index["strain"].values) + else: + sequence_strains = None - try: - meta_dict, meta_columns = read_metadata(args.metadata) - except ValueError as error: - print("ERROR: Problem reading in {}:".format(args.metadata)) - print(error) - return 1 + if sequence_strains is not None: + # Calculate the number of strains that don't exist in either metadata or sequences. + num_excluded_by_lack_of_metadata = len(sequence_strains - metadata_strains) + num_excluded_by_lack_of_sequences = len(metadata_strains - sequence_strains) + + # Intersect sequence strain names with metadata strains. + available_strains = metadata_strains & sequence_strains + else: + num_excluded_by_lack_of_metadata = None + num_excluded_by_lack_of_sequences = None + + # When no sequence data are available, we treat the metadata as the + # source of truth. + available_strains = metadata_strains + # Track the strains that are available to select by the filters below, after + # accounting for availability of metadata and sequences. + seq_keep = available_strains.copy() ##################################### #Filtering steps ##################################### - # remove sequences without meta data - tmp = [ ] - for seq_name in seq_keep: - if seq_name in meta_dict: - tmp.append(seq_name) - else: - print("No meta data for %s, excluding from all further analysis."%seq_name) - seq_keep = tmp + # Exclude all strains by default. + if args.exclude_all: + num_excluded_by_all = len(available_strains) + seq_keep = set() # remove strains explicitly excluded by name # read list of strains to exclude from file and prune seq_keep num_excluded_by_name = 0 if args.exclude: try: - with open(args.exclude, 'r', encoding='utf-8') as ifile: - to_exclude = set() - for line in ifile: - if line[0] != comment_char: - # strip whitespace and remove all text following comment character - exclude_name = line.split(comment_char)[0].strip() - to_exclude.add(exclude_name) - tmp = [seq_name for seq_name in seq_keep if seq_name not in to_exclude] - num_excluded_by_name = len(seq_keep) - len(tmp) - seq_keep = tmp + to_exclude = read_strains(*args.exclude) + num_excluded_by_name = len(seq_keep & to_exclude) + seq_keep = seq_keep - to_exclude except FileNotFoundError as e: print("ERROR: Could not open file of excluded strains '%s'" % args.exclude, file=sys.stderr) sys.exit(1) @@ -241,15 +298,15 @@ def run(args): else: # i.e. property=value requested if meta_dict[seq_name].get(col,'unknown').lower() == val.lower(): to_exclude.add(seq_name) - tmp = [seq_name for seq_name in seq_keep if seq_name not in to_exclude] - num_excluded_by_metadata[ex] = len(seq_keep) - len(tmp) - seq_keep = tmp + + num_excluded_by_metadata[ex] = len(seq_keep & to_exclude) + seq_keep = seq_keep - to_exclude # exclude strains by metadata, using Pandas querying num_excluded_by_query = 0 if args.query: - filtered = filter_by_query(seq_keep, args.metadata, args.query) - num_excluded_by_query = len(seq_keep) - len(filtered) + filtered = set(filter_by_query(list(seq_keep), args.metadata, args.query)) + num_excluded_by_query = len(seq_keep - filtered) seq_keep = filtered # filter by sequence length @@ -261,9 +318,11 @@ def run(args): is_in_seq_keep = sequence_index["strain"].isin(seq_keep) is_gte_min_length = sequence_index["ACGT"] >= args.min_length - seq_keep_by_length = sequence_index[ - (is_in_seq_keep) & (is_gte_min_length) - ]["strain"].tolist() + seq_keep_by_length = set( + sequence_index[ + (is_in_seq_keep) & (is_gte_min_length) + ]["strain"].tolist() + ) num_excluded_by_length = len(seq_keep) - len(seq_keep_by_length) seq_keep = seq_keep_by_length @@ -271,10 +330,10 @@ def run(args): # filter by ambiguous dates num_excluded_by_ambiguous_date = 0 if args.exclude_ambiguous_dates_by and 'date' in meta_columns: - seq_keep_by_date = [] + seq_keep_by_date = set() for seq_name in seq_keep: if not is_date_ambiguous(meta_dict[seq_name]['date'], args.exclude_ambiguous_dates_by): - seq_keep_by_date.append(seq_name) + seq_keep_by_date.add(seq_name) num_excluded_by_ambiguous_date = len(seq_keep) - len(seq_keep_by_date) seq_keep = seq_keep_by_date @@ -283,11 +342,11 @@ def run(args): num_excluded_by_date = 0 if (args.min_date or args.max_date) and 'date' in meta_columns: dates = get_numerical_dates(meta_dict, fmt="%Y-%m-%d") - tmp = [s for s in seq_keep if dates[s] is not None] + tmp = {s for s in seq_keep if dates[s] is not None} if args.min_date: - tmp = [s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>args.min_date] + tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>args.min_date} if args.max_date: - tmp = [s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s]) 0 - ] - ) + if args.include: + # Collect the union of all given strains to include. + to_include = read_strains(*args.include) - for s in to_include: - if s not in seq_keep: - seq_keep.append(s) - num_included_by_name += 1 + # Find requested strains that can be included because they have metadata + # and sequences. + available_to_include = available_strains & to_include + + # Track the number of strains that could and could not be included. + num_included_by_name = len(available_to_include) + num_not_included_by_name = len(to_include - available_to_include) + + # Union the strains that can be included with the sequences to keep. + seq_keep = seq_keep | available_to_include # add sequences with particular meta data attributes num_included_by_metadata = 0 if args.include_where: - to_include = [] + to_include = set() + for ex in args.include_where: try: col, val = ex.split("=") @@ -453,50 +528,90 @@ def run(args): continue # loop over all sequences and re-add sequences - for seq_name in all_seq: - if seq_name in meta_dict: - if meta_dict[seq_name].get(col)==val: - to_include.append(seq_name) - else: - print("WARNING: no metadata for %s, skipping"%seq_name) - continue - - for s in to_include: - if s not in seq_keep: - seq_keep.append(s) - num_included_by_metadata += 1 + for seq_name in available_strains: + if meta_dict[seq_name].get(col)==val: + to_include.add(seq_name) + + num_included_by_metadata = len(to_include) + seq_keep = seq_keep | to_include + + # Write output starting with sequences, if they've been requested. It is + # possible for the input sequences and sequence index to be out of sync + # (e.g., the index is a superset of the given sequences input), so we need + # to update the set of strains to keep based on which strains are actually + # available. + if is_vcf and args.output: + # Get the samples to be deleted, not to keep, for VCF + dropped_samps = list(available_strains - seq_keep) + write_vcf(args.sequences, args.output, dropped_samps) + elif args.sequences and args.output: + sequences = SeqIO.parse(args.sequences, "fasta") - ####Write out files + # Stream to disk all sequences that passed all filters to avoid reading + # sequences into memory first. Track the observed strain names in the + # sequence file as part of the single pass to allow comparison with the + # provided sequence index. + observed_sequence_strains = set() + with open(args.output, "w") as output_handle: + for sequence in sequences: + observed_sequence_strains.add(sequence.id) + + if sequence.id in seq_keep: + SeqIO.write(sequence, output_handle, 'fasta') + + if sequence_strains != observed_sequence_strains: + # Warn the user if the expected strains from the sequence index are + # not a superset of the observed strains. + if not observed_sequence_strains <= sequence_strains: + print( + "WARNING: The sequence index is out of sync with the provided sequences.", + "Augur will only output strains with available sequences.", + file=sys.stderr + ) + + # Update the set of available sequence strains and which of these + # strains passed filters. This prevents writing out strain lists or + # metadata for strains that have no sequences. + sequence_strains = observed_sequence_strains + seq_keep = seq_keep & sequence_strains + + # Calculate the number of strains that don't exist in either + # metadata or sequences. + num_excluded_by_lack_of_metadata = len(sequence_strains - metadata_strains) + num_excluded_by_lack_of_sequences = len(metadata_strains - sequence_strains) + + if args.output_metadata: + metadata_df = pd.DataFrame([meta_dict[strain] for strain in seq_keep]) + metadata_df.to_csv( + args.output_metadata, + sep="\t", + index=False + ) - if is_vcf: - #get the samples to be deleted, not to keep, for VCF - dropped_samps = list(set(all_seq) - set(seq_keep)) - if len(dropped_samps) == len(all_seq): #All samples have been dropped! Stop run, warn user. - print("ERROR: All samples have been dropped! Check filter rules and metadata file format.") - return 1 - write_vcf(args.sequences, args.output, dropped_samps) + if args.output_strains: + with open(args.output_strains, "w") as oh: + for strain in sorted(seq_keep): + oh.write(f"{strain}\n") + # Calculate the number of strains passed and filtered. + if sequence_strains is not None: + all_strains = metadata_strains | sequence_strains else: - # It should not be possible to have ids in the list of sequences to keep - # that do not exist in the original input sequences, since we built this - # list of ids from the sequence index. Just to be safe though, we find - # the intersection of these two lists of ids to determine if all samples - # were dropped or not. This final list of ids is in the same order as - # the input sequences such that output sequences are always in the same - # order for a given set of filters. - sequences = SeqIO.parse(args.sequences, "fasta") - sequences_to_write = (sequence for sequence in sequences if sequence.id in seq_keep) + all_strains = metadata_strains - # Write out sequences that passed all filters using an iterator to - # ensure that sequences are streamed to disk without being read into - # memory first. - sequences_written = SeqIO.write(sequences_to_write, args.output, 'fasta') + total_strains_passed = len(seq_keep) + total_strains_filtered = len(all_strains) - total_strains_passed - if sequences_written == 0: - print("ERROR: All samples have been dropped! Check filter rules and metadata file format.", file=sys.stderr) - return 1 + print(f"{total_strains_filtered} strains were dropped during filtering") + + if num_excluded_by_lack_of_sequences: + print(f"\t{num_excluded_by_lack_of_sequences} had no sequence data") + + if num_excluded_by_lack_of_metadata: + print(f"\t{num_excluded_by_lack_of_metadata} had no metadata") - print("\n%i sequences were dropped during filtering" % (len(all_seq) - len(seq_keep),)) + if args.exclude_all: + print(f"\t{num_excluded_by_all} of these were dropped by `--exclude-all`") if args.exclude: print("\t%i of these were dropped because they were in %s" % (num_excluded_by_name, args.exclude)) if args.exclude_where: @@ -516,12 +631,19 @@ def run(args): seed_txt = ", using seed {}".format(args.subsample_seed) if args.subsample_seed else "" print("\t%i of these were dropped because of subsampling criteria%s" % (num_excluded_subsamp, seed_txt)) - if args.include and os.path.isfile(args.include): - print("\n\t%i sequences were added back because they were in %s" % (num_included_by_name, args.include)) + if args.include: + print(f"\n\t{num_included_by_name} strains were added back because they were requested by include files") + + if num_not_included_by_name: + print(f"\t{num_not_included_by_name} strains from include files were not added because they lacked sequence or metadata") if args.include_where: print("\t%i sequences were added back because of '%s'" % (num_included_by_metadata, args.include_where)) - print("%i sequences have been written out to %s" % (len(seq_keep), args.output)) + if total_strains_passed == 0: + print("ERROR: All samples have been dropped! Check filter rules and metadata file format.", file=sys.stderr) + return 1 + + print(f"{total_strains_passed} strains passed all filters") def _filename_gz(filename): diff --git a/augur/utils.py b/augur/utils.py index 5fb1ed923..b68c484e3 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -686,3 +686,38 @@ def load_mask_sites(mask_file): "A", "G", "C", "T", "U", "N", "R", "Y", "S", "W", "K", "M", "B", "V", "D", "H", "-", "a", "g", "c", "t", "u", "n", "r", "y", "s", "w", "k", "m", "b", "v", "d", "h", "-" } + + +def read_strains(*files, comment_char="#"): + """Reads strain names from one or more plain text files and returns the + set of distinct strains. + + Strain names can be commented with full-line or inline comments. For + example, the following is a valid strain names file: + + # this is a comment at the top of the file + strain1 # exclude strain1 because it isn't sequenced properly + strain2 + # this is an empty line that will be ignored. + + Parameters + ---------- + files : one or more str + one or more names of text files with one strain name per line + + Returns + ------- + set : + strain names from the given input files + + """ + strains = set() + for input_file in files: + with open(input_file, 'r', encoding='utf-8') as ifile: + for line in ifile: + # Allow comments anywhere in a given line. + strain_name = line.split(comment_char)[0].strip() + if len(strain_name) > 0: + strains.add(strain_name) + + return strains diff --git a/tests/builds/zika.t b/tests/builds/zika.t index c662cb1a2..5e337732b 100644 --- a/tests/builds/zika.t +++ b/tests/builds/zika.t @@ -43,8 +43,8 @@ Filter sequences by a minimum date and an exclusion list and only keep one seque > --subsample-seed 314159 \ > --no-probabilistic-sampling \ > --min-date 2012 > /dev/null - - $ diff -u "results/filtered.fasta" "$TMP/out/filtered.fasta" + $ grep "^>" "$TMP/out/filtered.fasta" | wc -l + \s*10 (re) Align filtered sequences to a specific reference sequence and fill any gaps. diff --git a/tests/functional/filter.t b/tests/functional/filter.t index 65e6877bb..3d15b2bbe 100644 --- a/tests/functional/filter.t +++ b/tests/functional/filter.t @@ -33,7 +33,7 @@ This should fail, as probabilistic sampling is explicitly disabled. > --subsample-seed 314159 \ > --no-probabilistic-sampling \ > --output "$TMP/filtered.fasta" - ERROR: Asked to provide at most 5 sequences, but there are 10 groups. + ERROR: Asked to provide at most 5 sequences, but there are 8 groups. [1] $ rm -f "$TMP/filtered.fasta" @@ -63,3 +63,188 @@ Using the default probabilistic subsampling, should work the same as the previou > --subsample-seed 314159 \ > --output "$TMP/filtered.fasta" > /dev/null $ rm -f "$TMP/filtered.fasta" + +Filter using only metadata without sequence input or output and save results as filtered metadata. + + $ ${AUGUR} filter \ + > --sequence-index filter/sequence_index.tsv \ + > --metadata filter/metadata.tsv \ + > --min-date 2012 \ + > --min-length 10500 \ + > --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null + +Output should include the 8 sequences matching the filters and a header line. + + $ wc -l "$TMP/filtered_metadata.tsv" + \s*9 .* (re) + $ rm -f "$TMP/filtered_metadata.tsv" + +Filter using only metadata and save results as a list of filtered strains. + + $ ${AUGUR} filter \ + > --sequence-index filter/sequence_index.tsv \ + > --metadata filter/metadata.tsv \ + > --min-date 2012 \ + > --min-length 10500 \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + +Output should include only the 8 sequences matching the filters (without a header line). + + $ wc -l "$TMP/filtered_strains.txt" + \s*8 .* (re) + $ rm -f "$TMP/filtered_strains.txt" + +Filter using only metadata without a sequence index. +This should work because the requested filters don't rely on sequence information. + + $ ${AUGUR} filter \ + > --metadata filter/metadata.tsv \ + > --min-date 2012 \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + $ rm -f "$TMP/filtered_strains.txt" + +Try to filter using only metadata without a sequence index. +This should fail because the requested filters rely on sequence information. + + $ ${AUGUR} filter \ + > --metadata filter/metadata.tsv \ + > --min-length 10000 \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information. + [1] + +Try to filter with sequence outputs and no sequence inputs. +This should fail. + + $ ${AUGUR} filter \ + > --sequence-index filter/sequence_index.tsv \ + > --metadata filter/metadata.tsv \ + > --min-length 10000 \ + > --output "$TMP/filtered.fasta" > /dev/null + ERROR: You need to provide sequences to output sequences. + [1] + +Try to filter without any outputs. + + $ ${AUGUR} filter \ + > --sequence-index filter/sequence_index.tsv \ + > --metadata filter/metadata.tsv \ + > --min-length 10000 > /dev/null + ERROR: You need to select at least one output. + [1] + +Filter into two separate sets and then select sequences from the union of those sets. +First, select strains from Brazil (there should be 1). + + $ ${AUGUR} filter \ + > --metadata filter/metadata.tsv \ + > --query "country == 'Brazil'" \ + > --output-strains "$TMP/filtered_strains.brazil.txt" > /dev/null + $ wc -l "$TMP/filtered_strains.brazil.txt" + \s*1 .* (re) + +Then, select strains from Colombia (there should be 3). + + $ ${AUGUR} filter \ + > --metadata filter/metadata.tsv \ + > --query "country == 'Colombia'" \ + > --output-strains "$TMP/filtered_strains.colombia.txt" > /dev/null + $ wc -l "$TMP/filtered_strains.colombia.txt" + \s*3 .* (re) + +Finally, exclude all sequences except those from the two sets of strains (there should be 4). + + $ ${AUGUR} filter \ + > --sequences filter/sequences.fasta \ + > --sequence-index filter/sequence_index.tsv \ + > --metadata filter/metadata.tsv \ + > --exclude-all \ + > --include "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \ + > --output "$TMP/filtered.fasta" > /dev/null + $ grep "^>" "$TMP/filtered.fasta" | wc -l + \s*4 (re) + $ rm -f "$TMP/filtered.fasta" + +Alternately, exclude only the sequences from Brazil and Colombia (12 - 4 strains). + + $ ${AUGUR} filter \ + > --sequences filter/sequences.fasta \ + > --sequence-index filter/sequence_index.tsv \ + > --metadata filter/metadata.tsv \ + > --exclude "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \ + > --output "$TMP/filtered.fasta" > /dev/null + $ grep "^>" "$TMP/filtered.fasta" | wc -l + \s*6 (re) + $ rm -f "$TMP/filtered.fasta" + +Try to filter with sequences that don't match any of the metadata. +This should produce no results because the intersection of metadata and sequences is empty. + + $ echo -e ">something\nATCG" > "$TMP/dummy.fasta" + $ ${AUGUR} filter \ + > --sequences "$TMP/dummy.fasta" \ + > --metadata filter/metadata.tsv \ + > --max-date 2020-01-30 \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + WARNING: A sequence index was not provided, so we are generating one. Generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + ERROR: All samples have been dropped! Check filter rules and metadata file format. + [1] + $ wc -l "$TMP/filtered_strains.txt" + \s*0 .* (re) + $ rm -f "$TMP/filtered_strains.txt" + +Repeat with sequence and strain outputs. We should get the same results. + + $ ${AUGUR} filter \ + > --sequences "$TMP/dummy.fasta" \ + > --metadata filter/metadata.tsv \ + > --max-date 2020-01-30 \ + > --output-strains "$TMP/filtered_strains.txt" \ + > --output-sequences "$TMP/filtered.fasta" > /dev/null + WARNING: A sequence index was not provided, so we are generating one. Generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + ERROR: All samples have been dropped! Check filter rules and metadata file format. + [1] + $ wc -l "$TMP/filtered_strains.txt" + \s*0 .* (re) + $ grep "^>" "$TMP/filtered.fasta" | wc -l + \s*0 (re) + $ rm -f "$TMP/filtered_strains.txt" + $ rm -f "$TMP/filtered.fasta" + +Filter TB strains from VCF and save as a list of filtered strains. + + $ ${AUGUR} filter \ + > --sequences filter/tb.vcf.gz \ + > --metadata filter/tb_metadata.tsv \ + > --min-date 2012 \ + > --min-length 10500 \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + $ wc -l "$TMP/filtered_strains.txt" + \s*3 .* (re) + $ rm -f "$TMP/filtered_strains.txt" + +Confirm that filtering omits strains without metadata or sequences. +The input sequences are missing one strain that is in the metadata. +The metadata are missing one strain that has a sequence. +The list of strains to include has one strain with no metadata/sequence and one strain with information that would have been filtered by country. +The query initially filters 3 strains from Colombia, one of which is added back by the include. + + $ echo "NotReal" > "$TMP/include.txt" + $ echo "COL/FLR_00008/2015" >> "$TMP/include.txt" + $ ${AUGUR} filter \ + > --sequence-index filter/sequence_index.tsv \ + > --metadata filter/metadata.tsv \ + > --query "country != 'Colombia'" \ + > --include "$TMP/include.txt" \ + > --output-strains "$TMP/filtered_strains.txt" + 4 strains were dropped during filtering + \t1 had no sequence data (esc) + \t1 had no metadata (esc) + \t3 of these were filtered out by the query: (esc) + \t\t"country != 'Colombia'" (esc) + (esc) + \t1 strains were added back because they were requested by include files (esc) + \t1 strains from include files were not added because they lacked sequence or metadata (esc) + 8 strains passed all filters + + $ rm -f "$TMP/filtered_strains.txt" diff --git a/tests/functional/filter/metadata.tsv b/tests/functional/filter/metadata.tsv index 319d519b6..dc66a193c 100644 --- a/tests/functional/filter/metadata.tsv +++ b/tests/functional/filter/metadata.tsv @@ -1,5 +1,4 @@ strain virus accession date region country division city db segment authors url title journal paper_url -PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774 Direct Submission Submitted (29-APR-2016) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/ COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/ PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 Phylogeny of Zika Virus in Western Hemisphere, 2015 Emerging Infect. Dis. 22 (5), 933-935 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27088323 COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/ diff --git a/tests/functional/filter/sequence_index.tsv b/tests/functional/filter/sequence_index.tsv index 9ead2af42..16dafa0bd 100644 --- a/tests/functional/filter/sequence_index.tsv +++ b/tests/functional/filter/sequence_index.tsv @@ -10,4 +10,3 @@ DOM/2016/BB_0059 10035 2563 2089 2741 2015 621 6 0 0 0 BRA/2016/FC_6706 10366 2747 2203 2915 2165 329 7 0 0 0 DOM/2016/BB_0183 10621 2910 2343 3099 2269 0 0 0 0 0 EcEs062_16 10812 2960 2388 3158 2306 0 0 0 0 0 -HND/2016/HU_ME59 10365 2842 2271 3016 2233 0 3 0 0 0 diff --git a/tests/functional/filter/sequences.fasta b/tests/functional/filter/sequences.fasta index 3e37aa57f..8149494fc 100644 --- a/tests/functional/filter/sequences.fasta +++ b/tests/functional/filter/sequences.fasta @@ -1962,177 +1962,3 @@ ggcctgaactggagatcagctgtggatctccagaagagggactagtggttagaggagacc ccccggaaaacgcaaaacagcatattgacgctgggaaagaccagagactccatgagtttc caccacgctggccgccaggcacagatcgccgaatagcggcggccggtgtggggaaatcca tgggagatcgga ->HND/2016/HU_ME59 -gtttgaagcgaaagctagcaacagtatcaacaggttttattttggatttggaaacgagag -tttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgcta -aaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggactt -ctgctgggtcatgggcccatcaggatggtcttggcgattctagcctttttgagattcacg -gcaatcaagccatcactgggtctcatcaatagatggggttcagtggggaaaaaagaggct -atggaaataataaagaagttcaagaaagatctggctgccatgctgagaataatcaatgct -aggaaggagaagaagagacgaggcgcagatactagtgtcggaattgttggcctcctgctg -accacagctatggcagcggaggtcactagacgtgggagtgcatactatatgtacttggac -agaaacgatgctggggaggccatatcttttccaaccacattggggatgaataagtgttat -atacagatcatggatcttggacacatgtgtgatgccaccatgagctatgaatgccctatg -ctggatgagggggtggaaccagatgacgtcgattgttggtgcaacacgacgtcaacttgg -gttgtgtacggaacctgccatcacaaaaaaggtgaagcacggagatctagaagagctgtg -acgctcccctcccattccactaggaagctgcaaacgcggtcgcaaacctggttggaatca -agagaatacacaaagcacttgattagagtcgaaaattggatattcaggaaccctggcttc -gcgttagcagcagctgccatcgcttggcttttgggaagctcaacgagccaaaaagtcata -tacttggtcatgatactgctgattgccccggcatacagcatcaggtgcataggagtcagc -aatagggactttgtggaaggtatgtcaggtgggacttgggttgatgttgtcttggaacat -ggaggttgtgtcaccgtaatggcacaggacaaaccgactgtcgacatagagctggttaca -acaacagtcagcaacatggcggaggtaagatcctactgctatgaggcatcaatatcagac -atggcttcggacagccgctgcccaacacaaggtgaagcctaccttgacaagcaatcagac -actcaataygtctgcaaaagaacgttagtggacagaggctggggaaatggatgtggactt -tttggcaaagggagcctggtgacatgcgctaagtttgcatgctccaagaaaatgaccggg -aagagcatccagccagagaatctggagtaccggataatgctgtcagttcatggctcccag -cacagtgggatgatcgttaatgacacaggacatgaaactgatgagaatagagcgaaggtt -gagataacgcccawttcaccaagagccgaagccaccctggggggttttggaagcctagga -cttgattgtgaaccgaggacaggccttgacttttcagatttgtattacttgactatgaat -aacaagcactggttggttcacaaggagtggttccacgacattccattaccttggcacgct -ggggcagacaccggaactccacactggaacaacaaagaagcactggtagagttcaaggac -gcacatgccaaaaggcaaactgtcgtggttctagggagtcaagaaggagcagttcacacg -gcccttgctggagctctggaggctgagatggatggtgcaaagggaaggctgtcctctggc -cacttgaaatgtcgcctgaaaatggataaacttagattgaagggcgtgtcatactccttg -tgtaccgcagcgttcacattcaccaagatcccggctgaaacactgcacgggacagtcaca -gtggaggtacagtacgcagggacagatggaccttgcaaggttccagctcagatggcggtg -gacatgcaaactctgaccccagttgggaggttgataaccgctaaccccgtaatcactgaa -agcactgagaactctaagatgatgctggaacttgatccaccatttggggactcttacatt -gtcataggagtcggggagaagaagatcacccaccactggcacaggagtggcagcaccatt -ggaaaagcatttgaagccactgtgagaggtgccaagagaatggcagtcttgggagacaca -gcctgggactttggatcagttggaggcgctctcaactcattgggcaagggcatccatcaa -atttttggagcagctttcaaatcattgtttggaggaatgtcctggttctcacaaattctc -attggaacgttgctgatgtggttgggtctgaacacaaagaatggatctatttcccttatg -tgcttggccttagggggagtgttgatcttcttatccacagccgtctctgctgatgtgggg -tgctcggtggacttctcaaagaaggagacgagatgcggtacaggggtgttcgtctataac -gacgttgaagcctggagggacaggtacaagtaccatcctgactccccccgtagattggca -gcagcagtcaagcaagcctgggaagatggtatctgcgggatctcctctgtttcaagaatg -gaaaacatcatgtggagatcagtagaaggggagctcaacgcaatcctggaagagaatgga -gttcaactgacggtcgttgtgggatctgtaaaaaaccccatgtggagagctccacagaga -ttgcccgtgcctgtgaacgagctgccccacggctggaaggcttgggggaaatcgtacttc -gtcagagcagcaaagacaaataacagctttgtcgtggatggtgacacactgaaggaatgc -ccactcaaacatagagcatggaacagctttcttgtggaggatcatgggttcggggtattt -cacactagtgtctggctcaaggttagagaagattattcattagagtgtgatccagccgtt -attggaacagctgttaagggaaaggaggctgtacacagtgatctaggctactggattgag -agtgagaagaatgacacatggaggctgaagagggcccatctgatcgagatgaaaacatgt -gaatggccaaagtcccacacattgtggacagatggaatagaagagagtgatctgatcata -cccaagtctttagctgggccactcagccatcacaataccagagagggctacaggacccaa -atgaaagggccatggcacagtgaagagcttgaaattcggtttgaggaatgcccaggcact -aaggtccacgtggaggaaacatgtggaacaagaggaccatctctgagatcaaccactgca -agcggaagggtgatcgaggaatggtgctgcagggagtgcacaatgcccccactgtcgttc -cgggctaaagatggctgttggtatggaatggagataaggcccaggaaagaaccagaaagc -aacttagtaaggtcaatggtgactgcaggatcaactgatcacatggatcacttctccctt -ggagtgcttgtgattctgctcatggtgcaggaagggctaaagaagagaatgaccacaaag -atcatcataagcacatcaatggcagtgctggtagctatgatcctgggaggattttcaatg -agtgacctggctaagcttgcaattttgatgggtgccaccttcgcggaaatgaacactgga -ggagatgtagctcatctggcgctgatagcggcattcaaagtcagaccagcgttgctggta -tctttcatcttcagagctaattggacaccccgtgaaagcatgctactggccttggcctcg -tgtcttttgcaaactgcgatctccgccttggaaggcgacctgatggttctcatcaatggt -tttgctttggcctggttggcaatacgagcgatggttgttccacgcactgataacatcacc -ttggcaatcctggctgctctgacaccactggcccggggcacactgcttgtggcgtggaga -gcaggccttgctacttgcggggggtttatgctcctctctctgaagggaaaaggcagtgtg -aagaagaacttaccatttgtcatggccctgggactaaccgctgtgaggctggtcgacccc -atcaacgtggtgggactgctgttgctcacaaggagtgggaagcggagctggccccctagc -gaagtactcacagctgttggcctgatatgcgcattggctggagggttcgccaaggcagat -atagagatggctgggcccatggccgcggtcggtctgctaattgtcagttacgtggtctca -ggaaagagtgtggacatgtacattgaaagagcaggtgacatcacatgggaaaaagatgcg -gaagtcactggaaacagtccccggctcgatgtggcgctagatgagagtggtgatttctcc -ctggtggaggatgacggtccccccatgagagagatcatactcaaggtggtcctgatgacc -atctgtggcatgaacccaatagccataccctttgcagctggagcgtggtacgtatacgtg -aagactggaaaaaggagtggtgctctatgggatgtgcctgctcccaaggaagtaaaaaag -ggggagaccacagatggagtgtacagagtaatgactcgtagactgctaggttcaacacaa -gttggagtgggagtcatgcaagagggggtctttcacactatgtggcacgtcacaaaagga -tccgcactgagaagcggtgaagggagacttgatccatactggggagatgtcaagcaggat -ctggtgtcatactgtggtccatggaagctagatgccgcctgggacgggcacagcgaggtg -cagctcctggccgtgccccccggagagagagcgaggaacatccagactctgcccggaata -tttaagacaaaggatggggacattggagcggttgcgctggattacccagcaggaacttca -ggatctccaatcctagacaagtgtgggagagtgataggactttatggcaatggggtcgtg -atcaaaaatgggagttatgttagtgccatcacccaagggaggagggaggaagagactcct -gttgagtgcttcgagccttcgatgctgaagaagaagcagctaactgtcttagacttacat -cctggagctgggaaaaccaggagagttcttcctgaaatagtccgtgaagccataaaaaca -agactccgtactgtgatcttagctccaaccagggttgtcgctgctgaaatggaggaggcc -cttagagggcttccagtgcgttatatgacaacagcagtcaatgtcacccactctggaaca -gaaatcgtcgacttaatgtgccatgccaccttcacttcacgtctactacagccaatcaga -gtccccaactataatctgtatattatggatgaggcccacttcacagatccctcaagtata -gcagcaagaggatacatttcaacaagggttgagatgggcgaggcggctgccatcttcatg -accgccacgccaccaggaacccgtgacgcatttccggactccaactcaccaattatggac -accgaagtggaagtcccagagagagcctggagctcaggctttgattgggtgacggatcat -tctggaaaaacagtttggtttgttccaagcgtgaggaacggcaatgagatcgcagcttgt -ctgacaaaggctggaaaacgggtcatacagctcagcagaaagacttttgagacagagttc -cagaaaacaaaacatcaagagtgggactttgtcgtgacaactgacatttcagagatgggc -gccaactttaaagctgaccgtgtcatagattccaggagrtgcctaaagccggtcatactt -gatggcgagagagtcattctggctggacccatgcctgtcacacatgccagcgctgcccag -aggagggggcgcataggcaggaatcccaacaaacctggagatgagtatctgtatggaggt -gggtgcgcagagactgacgaagaccatgcacactggcttgaagcaagaatgctccttgac -aatatttacctccaagatggcctcatagcctcgctctatcgacctgaggccgacaaagta -gcagccattgagggagagttcaagcttaggacggagcaaaggaagacctttgtggaactc -atgaaaagaggagatcttcctgtttggctggcctatcaggttgcatctgccggaataacc -tacacagatagaagatggtgctttgatggcacgaccaacaacaccatactggaagacagt -gtgccggcagaggtgtggaccagacacggagagaaaagagtgctcaaaccgaggtggatg -gacgccagagtttgttcagatcatgcggccctgaagtcattcaaggagtttgccgctggg -aaaagaggagcggcttttggagtgatggaagccctgggaacactgccaggacacatgaca -gagagattccaggaagccattgacaacctcgctgtgctcatgcgggcagagactggaagc -aggccttacaaagccgcggcggcccaattgccggagaccctagagaccattatgcttttg -gggttgctgggaacagtctcgctgggaatctttttcgtcttgatgaggaacaagggcata -gggaagatgggctttggaatggtgacccttggggccagtgcatggctcatgtggctctcg -gaaattgagccagccagaattgcatgtgtcctcattgttgtgttcctattgctggtggtg -ctcatacctgagccagaaaagcaaagatctccccaggacaaccaaatggcaatcatcatc -atggtagcagtaggtcttctgggcttgattaccgccaatgaactcggatggttggagaga -acaaagagtgacctaagccatctgatgggaaggagagaggagggggcaaccataggattc -tcaatggacattgacctgcggccagcctcagcttgggccatctatgctgccttgacaact -ttcattaccccagccgtccaacatgcagtgaccacttcatacaacaactactccttaatg -gcgatggccacgcaagctggagtgttgtttggtatgggcaaagggatgccattctacgca -tgggactttggagtcccgctgctaatgataggttgctactcacaattaacacccctgacc -ctaatagtggccatcattttgctcgtggcgcactacatgtacttgatcccagggctgcag -gcagcagctgcgcgtgctgcccagaagagaacggcagctggcatcatgaagaaccctgtt -gtggatggaatagtggtgactgacattgacacaatgacaattgacccccaagtggagaaa -aagatgggacaggtgctactcatagcagtagccgtctccagcgccatactgtcgcggacc -gcctgggggtggggggaggctggggccctgatcacagccgcaacttccactttgtgggaa -ggctctccgaacaagtactggaactcctctacagccacttcactgtgtaacatttttagg -ggaagttacttggctggagcttctctaatctacacagtaacaagaaacgctggcttggtc -aagagacgtgggggtggaacaggagagaccctgggagagaaatggaaggcccgcttgaac -cagatgtcggccctggagttctactcctacaaaaagtcaggcatcaccgaggtgtgcaga -gaagaggcccgccgcgccctcaaggacggtgtggcaacgggaggccatgctgtgtcccga -ggaagtgcaaagctgagatggttggtggagcggggatacctgcagccctatggaaaggtc -attgatcttggatgtggcagagggggctggagttactacgccgccaccatccgcaaagtt -caagaagtgaaaggatacacaaaaggaggccctggtcatgaagaacccgtgttggtgcaa -agctatgggtggaacatagtccgtcttaagagtggggtggacgtctttcatatggcggct -gagccgtgtgacacgttgctgtgtgacataggtgagtcatcatctagtcctgaagtggaa -gaagcacggacgctcagagtcctctccatggtgggggattggcttgaaaaaagaccagga -gccttttgtataaaagtgttgtgcccatacaccagcactatgatggaaaccctggagcga -ctgcagcgtaggtatgggggaggactggtcagagtgccactctcccgcaactctacacat -gagatgtactgggtctctggagcgaaaagcaacaccataaaaagtgtgtccaccacgagc -cagctcctcttggggcgcatggacgggcctaggaggccagtgaaatatgaggaggatgtg -aatctcggctctggcacgcgggctgtggtaagctgcgctgaagctcccaacatgaagatc -attggtaaccgcattgaaaggatccgcagtgagcacgcggaaacgtggttctttgacgag -aaccacccatataggacatgggcttaccatggaagctatgaggcccccacacaagggtca -gcgtcctctctaataaacggggttgtcaggctcctgtcaaaaccctgggatgtggtgact -ggagtcacaggaatagccatgaccgacaccacaccgtatggtcagcaaagagttttcaag -gaaaaagtggacactagggtgccagacccccaagaaggcactcgtcaggttatgagcatg -gtctcttcctggttgtggaaagagctaggcaaacacaaacggccacgagtctgtaccaaa -gaagagttcatcaacaaggttcgtagcaatgcagcattaggggcaatatttgaagaggaa -aaagagtggaagactgcagtggaagctgtgaacgatccaaggttctgggctctagtggac -aaggaaagagagcaccacctgagaggagagtgccagagttgtgtgtacaacatgatggga -aaaagagaaaagaaacaaggggaatttggaaaggccaagggcagccgcgccatctggtat -atgtggctaggggctagatttctagagttcgaagcccttggattcttgaacgaggatcac -tggatggggagagagaactcaggaggtggtgttgaagggctgggattacaaagactcgga -tatgtcctagaagagatgagttgcataccaggaggaaggatgtatgcagatgacactgct -ggctgggacacccgcatcagcaggtttgatctggagaatgaagctctaatcaccaaccaa -atggagaaagggcacagggccttggcattggccataatcaagtacacataccaaaacaaa -gtggtaaaggtccttagaccagctgaaaaagggaaaacagttatggacattatttcgaga -caagaccaaagggggagcggacaagttgtcacttacgctcttaacacatttaccaaccta -gtggtgcaactcatccggaatatggaggctgaggaagttctagagatgcaagacttgtgg -ctgctgcggaggtcagagaaagtgaccaactggttgcagagcaacggatgggataggctc -aaacgaatggcagtcagtggagatgattgcgttgtgaagccaattgatgataggtttgca -catgccctcaggttcttgaatgatatgggaaaagttaggaaggacacacaagagtggaaa -ccctcaactggatgggacaactgggaagaagttccgttttgctcccaccacttcaacaag -ctccatctcaaggacgggaggtccattgtggttccctgccgccaccaagatgaactgatt -ggccgggcccgcgtctctccaggggcgggatggagcatccgggagactgcttgcctagca -aaatcatatgcgcaaatgtggcagctcctttatttccacagaagggacctccgactgatg -gccaatgccatttgttcatctgtgccagttgactgggttccaactgggagaactacctgg -tcaatccatggaaagggagaatggatgaccactgaagacatgcttgtggtgtggaacaga -gtgtggattgaggagaacgaccacatggaagacaagaccccagttacgaaatggacagac -attccctatttgggaaaaagggaagacttgtggtgtggatctctcatagggcacagaccg -cgcaccacctgggctgagaacattaaaaacacagtcaacatggtgcgcaggatcataggt -gatgaagaaaagtacatggactacctatccacccaagttcgctacttgggtgaagaaggg -tctacacctggagtgctgtaagcaccaatcttaatgttgtcaggc diff --git a/tests/functional/filter/tb.vcf.gz b/tests/functional/filter/tb.vcf.gz new file mode 100644 index 000000000..1eb0eeba4 Binary files /dev/null and b/tests/functional/filter/tb.vcf.gz differ diff --git a/tests/functional/filter/tb_metadata.tsv b/tests/functional/filter/tb_metadata.tsv new file mode 100644 index 000000000..20eb69dc8 --- /dev/null +++ b/tests/functional/filter/tb_metadata.tsv @@ -0,0 +1,166 @@ +strain accession date region country location db authors cluster paper_url title +G22670 10155 1991-XX-XX north_america canada village_k genbank Lee et al Mj-I http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22671 10223 1992-XX-XX north_america canada village_d genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22672 11011 1991-XX-XX north_america canada village_k genbank Lee et al Mj-I http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22673 11234 1992-XX-XX north_america canada village_k genbank Lee et al Mj-I http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22674 14069 1993-XX-XX north_america canada village_k genbank Lee et al Mj-I http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22675 14508 1993-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22676 15613 1994-XX-XX north_america canada village_e genbank Lee et al Mj-V.d http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22677 16490 1995-XX-XX north_america canada village_e genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22678 16493 1995-XX-XX north_america canada village_k genbank Lee et al Mj-I http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22679 18421 1996-XX-XX north_america canada village_k genbank Lee et al Mj-I http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22680 18422 1996-XX-XX north_america canada village_k genbank Lee et al Mj-I http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22681 18747 1996-XX-XX north_america canada village_k genbank Lee et al Mj-II http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22682 18988 1996-XX-XX north_america canada village_k genbank Lee et al Mj-II http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22683 19057 1996-XX-XX north_america canada village_k genbank Lee et al Mj-II http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22684 19276 1996-XX-XX north_america canada village_k genbank Lee et al Mj-II http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22685 50045 1996-XX-XX north_america canada village_d genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22686 50179 1997-XX-XX north_america canada village_k genbank Lee et al Mj-II http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22687 50248 1997-XX-XX north_america canada village_d genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22688 53221 1998-XX-XX north_america canada village_c genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22689 54902 1999-XX-XX north_america canada other_village genbank Lee et al Mj-IV.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22690 55546 2000-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22691 55753 2000-XX-XX north_america canada village_b genbank Lee et al Mj-VI http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22692 55988 2005-XX-XX north_america canada village_b genbank Lee et al Mj-VI http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22693 55989 2000-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22694 56828 2000-XX-XX north_america canada other_village genbank Lee et al Mn http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22695 57052 2001-XX-XX north_america canada village_k genbank Lee et al Mj-II http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22696 58385 2001-XX-XX north_america canada other_village genbank Lee et al NA http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22697 60053 2002-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22698 62796 2004-XX-XX north_america canada village_e genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22699 62806 2004-XX-XX north_america canada village_d genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22700 62957 2004-XX-XX north_america canada village_d genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22701 63113 2004-XX-XX north_america canada village_d genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22702 63670 2004-XX-XX north_america canada village_k genbank Lee et al Mj-II http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22703 63878 2004-XX-XX north_america canada village_c genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22704 64165 2004-XX-XX north_america canada village_k genbank Lee et al Mj-II http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22705 64334 2005-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22706 64712 2005-XX-XX north_america canada village_d genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22707 65165 2005-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22708 66591 2006-XX-XX north_america canada village_c genbank Lee et al Mj-V.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22709 68995 2007-XX-XX north_america canada village_k genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22710 73787 2010-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22711 74856 2010-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22712 78932 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22713 79031 2012-XX-XX north_america canada village_a genbank Lee et al Mj-V.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22669 9965 1991-XX-XX north_america canada village_d genbank Lee et al Mj-IV.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22589 MT-0080 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22668 MT-0712 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22582 MT-0718 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22587 MT-0972 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22590 MT-1103 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22591 MT-1128 2011-XX-XX north_america canada village_b genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22592 MT-1167 2008-XX-XX north_america canada village_b genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22593 MT-1206 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22666 MT-1212 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22594 MT-1247 2012-XX-XX north_america canada village_a genbank Lee et al Mn http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22714 MT-13-1408 2013-XX-XX north_america canada village_a genbank Lee et al Mn http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22715 MT-13-1711 2013-XX-XX north_america canada village_a genbank Lee et al Mn http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22716 MT-13-1712 2013-XX-XX north_america canada village_a genbank Lee et al Mn http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22717 MT-13-1753 2013-XX-XX north_america canada village_a genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22718 MT-13-1828 2013-XX-XX north_america canada village_a genbank Lee et al Mn http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22719 MT-13-1835 2013-XX-XX north_america canada village_a genbank Lee et al Mn http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22720 MT-13-1892 2013-XX-XX north_america canada village_a genbank Lee et al Mn http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22721 MT-13-2012 2013-XX-XX north_america canada village_a genbank Lee et al Mn http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22722 MT-13-2334 2013-XX-XX north_america canada village_a genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22723 MT-13-2384 2013-XX-XX north_america canada village_a genbank Lee et al Mj-V.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22725 MT-13-2690 2013-XX-XX north_america canada village_a genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22726 MT-13-2761 2013-XX-XX north_america canada village_a genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22727 MT-13-3209 2013-XX-XX north_america canada village_a genbank Lee et al Mj-V.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22728 MT-13-848 2013-XX-XX north_america canada village_a genbank Lee et al Mn http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22595 MT-131 2011-XX-XX north_america canada village_a genbank Lee et al Mj-V.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22596 MT-1336 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22597 MT-1345 2011-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22598 MT-1393 2008-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22599 MT-140 2009-XX-XX north_america canada village_c genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22600 MT-1403 2009-XX-XX north_america canada village_e genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22601 MT-1466 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22602 MT-1499 2011-XX-XX north_america canada village_c genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22603 MT-1549 2008-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22604 MT-1605 2008-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22605 MT-1684 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22606 MT-1799 2011-XX-XX north_america canada village_d genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22607 MT-1838 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22608 MT-1971 2011-XX-XX north_america canada village_c genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22609 MT-2151 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22610 MT-2174 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22611 MT-2175 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22612 MT-2178 2011-XX-XX north_america canada village_a genbank Lee et al Mj-V.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22613 MT-2184 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22614 MT-2224 2008-XX-XX north_america canada village_e genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22615 MT-2356 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22616 MT-2465 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22617 MT-2473 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22618 MT-2474 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22619 MT-2538 2011-XX-XX north_america canada village_c genbank Lee et al Mj-V.d http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22620 MT-2665 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22621 MT-2667 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22622 MT-2706 2010-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22623 MT-2720 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22624 MT-2762 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22625 MT-2768 2009-XX-XX north_america canada village_e genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22626 MT-2769 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22627 MT-2771 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22628 MT-2792 2006-XX-XX north_america canada village_c genbank Lee et al Mj-V.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22629 MT-2800 2010-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22631 MT-289 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22632 MT-2905 2006-XX-XX north_america canada village_c genbank Lee et al Mj-V.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22633 MT-2910 2006-XX-XX north_america canada village_c genbank Lee et al Mj-V.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22634 MT-2931 2006-XX-XX north_america canada village_c genbank Lee et al Mj-V.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22635 MT-3000 2009-XX-XX north_america canada village_c genbank Lee et al Mj-V.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22636 MT-3004 2009-XX-XX north_america canada village_c genbank Lee et al Mj-V.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22637 MT-3074 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22638 MT-3173 2009-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22639 MT-3194 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22640 MT-3239 2006-XX-XX north_america canada other_village genbank Lee et al Mj-V.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22641 MT-3255 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22642 MT-3271 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22643 MT-3281 2010-XX-XX north_america canada village_a genbank Lee et al Mj-V.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22644 MT-3296 2011-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22645 MT-3341 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22648 MT-3673 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22649 MT-3683 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22651 MT-3787 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22652 MT-389 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22653 MT-393 2009-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22729 MT-398 2005-XX-XX north_america canada other_village genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22654 MT-405 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22655 MT-4067 2007-XX-XX north_america canada village_d genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22656 MT-4137 2010-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22667 MT-4166 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22730 MT-4230 2010-XX-XX north_america canada village_a genbank Lee et al Mj-V.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22657 MT-441 2011-XX-XX north_america canada village_c genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22731 MT-4466 2011-XX-XX north_america canada village_d genbank Lee et al Mj-IV.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22658 MT-452 2011-XX-XX north_america canada village_c genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22659 MT-467 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22660 MT-4683 2011-XX-XX north_america canada village_d genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22661 MT-4846 2011-XX-XX north_america canada village_a genbank Lee et al Mj-V.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22662 MT-4854 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22663 MT-4884 2011-XX-XX north_america canada village_c genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22664 MT-4942 2007-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22665 MT-504 2011-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22565 MT-5195 2007-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22566 MT-5337 2007-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22567 MT-5373 2009-XX-XX north_america canada other_village genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22568 MT-5383 2007-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22569 MT-5447 2011-XX-XX north_america canada village_a genbank Lee et al Mj-V.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22732 MT-5488 2007-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22570 MT-5531 2011-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22571 MT-5543 2007-XX-XX north_america canada village_k genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22572 MT-567 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22733 MT-578 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22573 MT-5870 2009-XX-XX north_america canada other_village genbank Lee et al Mj-IV.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22574 MT-5983 2011-XX-XX north_america canada village_k genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22575 MT-6084 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22576 MT-6205 2011-XX-XX north_america canada village_a genbank Lee et al Mj-V.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22577 MT-6218 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22578 MT-6226 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22579 MT-6429 2012-XX-XX north_america canada village_k genbank Lee et al Mj-III.b http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22580 MT-661 2009-XX-XX north_america canada village_c genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22581 MT-692 2009-XX-XX north_america canada village_c genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22583 MT-721 2008-XX-XX north_america canada village_a genbank Lee et al Mj-III.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22584 MT-751 2008-XX-XX north_america canada village_b genbank Lee et al Mj-V.a http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22585 MT-853 2011-XX-XX north_america canada village_c genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22586 MT-877 2010-XX-XX north_america canada other_village genbank Lee et al Mj-IV.c http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22650 None1 20XX-XX-XX north_america canada ? genbank Lee et al http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit +G22724 None2 20XX-XX-XX north_america canada ? genbank Lee et al http://www.pnas.org/content/112/44/13609 Population Genomics of Mycobacterium tuberculosis in the Inuit diff --git a/tests/test_utils.py b/tests/test_utils.py index 264bcc643..a7d439804 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,5 @@ import datetime +from pathlib import Path from unittest.mock import patch import pytest @@ -129,3 +130,18 @@ def test_is_date_ambiguous(self): # Test incomplete date strings without ambiguous dates for the requested fields. assert not utils.is_date_ambiguous("2019", "year") assert not utils.is_date_ambiguous("2019-10", "month") + + def test_read_strains(self, tmpdir): + # Write one list of filenames with some unnecessary whitespace. + strains1 = Path(tmpdir) / Path("strains1.txt") + with open(strains1, "w") as oh: + oh.write("strain1 # this is an inline comment about strain 1\nstrain2\n # this is a comment preceded by whitespace.\n") + + # Write another list of filenames with a comment. + strains2 = Path(tmpdir) / Path("strains2.txt") + with open(strains2, "w") as oh: + oh.write("# this is a comment. ignore this.\nstrain2\nstrain3\n") + + strains = utils.read_strains(strains1, strains2) + assert len(strains) == 3 + assert "strain1" in strains