diff --git a/CHANGES.md b/CHANGES.md index 79f52910e..4028bd9a3 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,12 +4,14 @@ ### Major Changes -* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this supported any arbitrary delimiters for the metadata. It is now restricted to CSV and TSV, which are the officially supported formats for all Augur subcommands that use this function. [#812][] (@victorlin) +* export, filter, frequencies, refine, traits: From versions 10.0.0 through 21.1.0, arbitrary delimiters for `--metadata` were supported due to internal implementation differences from the advertised CSV and TSV support. Starting with this version, non-CSV/TSV files will no longer be supported by default. To adjust for this breaking change, specify custom delimiters with the new `--metadata-delimiters` flag. [#1196][] (@victorlin) +* `augur.io.read_metadata`: Previously, this supported any arbitrary delimiters for the metadata. It now requires a new argument, `delimiters`. [#812][] (@victorlin) ### Features * Constrain `bcbio-gff` to >=0.7.0 and allow `Biopython` >=1.81 again. We had to introduce the `Biopython` constraint in v21.0.1 (see [#1152][]) due to `bcbio-gff` <0.7.0 relying on the removed `Biopython` feature `UnknownSeq`. [#1178][] (@corneliusroemer) * `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this used the Python parser engine for [`pandas.read_csv()`][]. Updated to use the C engine for faster reading of metadata. [#812][] (@victorlin) +* curate: Allow custom metadata delimiters with the new `--metadata-delimiters` flag. 
[#1196][] (@victorlin) ### Bug fixes @@ -21,6 +23,7 @@ [#1152]: https://github.com/nextstrain/augur/pull/1152 [#1171]: https://github.com/nextstrain/augur/issues/1171 [#1178]: https://github.com/nextstrain/augur/pull/1178 +[#1196]: https://github.com/nextstrain/augur/pull/1196 [`pandas.read_csv()`]: https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.read_csv.html ## 21.1.0 (14 March 2023) diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index ee0938cc2..257ce2ff5 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -9,7 +9,7 @@ from augur.argparse_ import add_command_subparsers from augur.errors import AugurError from augur.io.json import dump_ndjson, load_ndjson -from augur.io.metadata import read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv +from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv from augur.io.sequences import write_records_to_fasta from augur.types import DataErrorMethod from . import normalize_strings, passthru @@ -46,11 +46,13 @@ def create_shared_parser(): If no input options are provided, commands will try to read NDJSON records from stdin. """) shared_inputs.add_argument("--metadata", - help="Input metadata file, as CSV or TSV. Accepts '-' to read metadata from stdin.") + help="Input metadata file. Accepts '-' to read metadata from stdin.") shared_inputs.add_argument("--id-column", help="Name of the metadata column that contains the record identifier for reporting duplicate records. " "Uses the first column of the metadata file if not provided. " "Ignored if also providing a FASTA file input.") + shared_inputs.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", + help="Delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") shared_inputs.add_argument("--fasta", help="Plain or gzipped FASTA file. 
Headers can only contain the sequence id used to match a metadata record. " + @@ -133,15 +135,30 @@ def run(args): args.metadata = sys.stdin if args.metadata and args.fasta: - records = read_metadata_with_sequences( - args.metadata, - args.fasta, - args.seq_id_column, - args.seq_field, - DataErrorMethod(args.unmatched_reporting), - DataErrorMethod(args.duplicate_reporting)) + try: + records = read_metadata_with_sequences( + args.metadata, + args.metadata_delimiters, + args.fasta, + args.seq_id_column, + args.seq_field, + DataErrorMethod(args.unmatched_reporting), + DataErrorMethod(args.duplicate_reporting)) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." + ) elif args.metadata: - records = read_table_to_dict(args.metadata, DataErrorMethod(args.duplicate_reporting), args.id_column) + try: + records = read_table_to_dict(args.metadata, args.metadata_delimiters, DataErrorMethod(args.duplicate_reporting), args.id_column) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." 
+ ) elif not sys.stdin.isatty(): records = load_ndjson(sys.stdin) else: diff --git a/augur/export_v1.py b/augur/export_v1.py index c6e2de9e6..5b7d7f506 100644 --- a/augur/export_v1.py +++ b/augur/export_v1.py @@ -9,8 +9,9 @@ from Bio import Phylo from argparse import SUPPRESS from collections import defaultdict +from .errors import AugurError from .argparse_ import ExtendAction -from .io.metadata import read_metadata +from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors def convert_tree_to_json_structure(node, metadata, div=0, strains=None): @@ -311,7 +312,9 @@ def get_root_sequence(root_node, ref=None, translations=None): def add_core_args(parser): core = parser.add_argument_group("REQUIRED") core.add_argument('--tree','-t', required=True, help="tree to perform trait reconstruction on") - core.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV") + core.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata") + core.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", + help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") core.add_argument('--node-data', required=True, nargs='+', action=ExtendAction, help="JSON files with meta data for each node") core.add_argument('--output-tree', help="JSON file name that is passed on to auspice (e.g., zika_tree.json).") core.add_argument('--output-meta', help="JSON file name that is passed on to auspice (e.g., zika_meta.json).") @@ -364,7 +367,14 @@ def run(args): meta_json = read_config(args.auspice_config) ensure_config_is_v1(meta_json) - meta_tsv = read_metadata(args.metadata) + try: + meta_tsv = read_metadata(args.metadata, args.metadata_delimiters) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. 
" + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." + ) add_tsv_metadata_to_nodes(nodes, meta_tsv, meta_json) tree_layout(T) diff --git a/augur/export_v2.py b/augur/export_v2.py index 3f606c797..68398ed56 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -11,7 +11,8 @@ from Bio import Phylo from .argparse_ import ExtendAction -from .io.metadata import read_metadata +from .errors import AugurError +from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata from .types import ValidationMode from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors from .validate import export_v2 as validate_v2, auspice_config_v2 as validate_auspice_config_v2, ValidateError @@ -867,7 +868,9 @@ def register_parser(parent_subparsers): optional_inputs = parser.add_argument_group( title="OPTIONAL INPUT FILES" ) - optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree, as CSV or TSV") + optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree") + optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", + help="delimiters to accept when reading a metadata file. 
Only one delimiter will be inferred.") optional_inputs.add_argument('--colors', metavar="FILE", help="Custom color definitions, one per line in the format `TRAIT_TYPE\\tTRAIT_VALUE\\tHEX_CODE`") optional_inputs.add_argument('--lat-longs', metavar="TSV", help="Latitudes and longitudes for geography traits (overrides built in mappings)") @@ -1039,13 +1042,19 @@ def run(args): if args.metadata is not None: try: - metadata_file = read_metadata(args.metadata).to_dict(orient="index") + metadata_file = read_metadata(args.metadata, args.metadata_delimiters).to_dict(orient="index") for strain in metadata_file.keys(): if "strain" not in metadata_file[strain]: metadata_file[strain]["strain"] = strain except FileNotFoundError: print(f"ERROR: meta data file ({args.metadata}) does not exist", file=sys.stderr) sys.exit(2) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." + ) except Exception as error: print(f"ERROR: {error}", file=sys.stderr) sys.exit(1) diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index fe1b70b02..24ef547d3 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -2,7 +2,7 @@ Filter and subsample a sequence set. """ from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT -from augur.io.metadata import VALID_ID_COLUMNS +from augur.io.metadata import DEFAULT_DELIMITERS, VALID_ID_COLUMNS from augur.types import EmptyOutputReportingMethod from . import constants @@ -14,11 +14,12 @@ def register_arguments(parser): unit tests that use this function to create argparser. 
""" input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered") - input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV") + input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata") input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format") input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.") input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.") - input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'") + input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") + input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. 
Only one delimiter will be inferred.") metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata") metadata_filter_group.add_argument( diff --git a/augur/filter/_run.py b/augur/filter/_run.py index aff4a3b69..03393dded 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -15,7 +15,7 @@ DELIMITER as SEQUENCE_INDEX_DELIMITER, ) from augur.io.file import open_file -from augur.io.metadata import read_metadata +from augur.io.metadata import InvalidDelimiter, read_metadata from augur.io.sequences import read_sequences, write_sequences from augur.io.print import print_err from augur.io.vcf import is_vcf as filename_is_vcf, write_vcf @@ -163,11 +163,19 @@ def run(args): all_sequences_to_include = set() filter_counts = defaultdict(int) - metadata_reader = read_metadata( - args.metadata, - id_columns=args.metadata_id_columns, - chunk_size=args.metadata_chunk_size, - ) + try: + metadata_reader = read_metadata( + args.metadata, + args.metadata_delimiters, + id_columns=args.metadata_id_columns, + chunk_size=args.metadata_chunk_size, + ) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." + ) for metadata in metadata_reader: duplicate_strains = ( set(metadata.index[metadata.index.duplicated()]) | @@ -309,6 +317,7 @@ def run(args): # have passed filters. 
metadata_reader = read_metadata( args.metadata, + args.metadata_delimiters, id_columns=args.metadata_id_columns, chunk_size=args.metadata_chunk_size, ) diff --git a/augur/frequencies.py b/augur/frequencies.py index 6dd040268..ced5211de 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -7,10 +7,11 @@ from Bio import Phylo, AlignIO from Bio.Align import MultipleSeqAlignment +from .errors import AugurError from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT, get_numerical_dates -from .io.metadata import read_metadata +from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata from .utils import read_node_data, write_json @@ -20,7 +21,9 @@ def register_parser(parent_subparsers): parser.add_argument('--method', choices=["diffusion", "kde"], required=True, help="method by which frequencies should be estimated") parser.add_argument('--metadata', type=str, required=True, metavar="FILE", - help="metadata including dates for given samples, as CSV or TSV") + help="metadata including dates for given samples") + parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", + help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") parser.add_argument('--regions', type=str, nargs='+', default=['global'], help="region to subsample to") parser.add_argument("--pivot-interval", type=int, default=3, @@ -80,7 +83,14 @@ def format_frequencies(freq): def run(args): - metadata = read_metadata(args.metadata) + try: + metadata = read_metadata(args.metadata, args.metadata_delimiters) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." 
+ ) dates = get_numerical_dates(metadata, fmt='%Y-%m-%d') stiffness = args.stiffness inertia = args.inertia diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 5219cb75e..414224dc9 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -1,5 +1,6 @@ import csv import os +from typing import Iterable import pandas as pd import pyfastx import sys @@ -12,24 +13,31 @@ from .file import open_file -# Accept the following delimiters when reading a metadata file. -VALID_DELIMITERS = (',', '\t') +DEFAULT_DELIMITERS = (',', '\t') # Accept the following column names to represent a unique ID per row, in order # of preference. VALID_ID_COLUMNS = ("strain", "name") -def read_metadata(metadata_file, id_columns=VALID_ID_COLUMNS, chunk_size=None): - """Read metadata from a given filename and into a pandas `DataFrame` or +class InvalidDelimiter(Exception): + pass + + +def read_metadata(metadata_file, delimiters, id_columns=VALID_ID_COLUMNS, chunk_size=None): + r"""Read metadata from a given filename and into a pandas `DataFrame` or `TextFileReader` object. Parameters ---------- metadata_file : str Path to a metadata file to load. + delimiters : list of str + List of possible delimiters to check for between columns in the metadata. + Only one delimiter will be inferred. id_columns : list of str List of possible id column names to check for, ordered by priority. + Only one id column will be inferred. chunk_size : int Size of chunks to stream from disk with an iterator instead of loading the entire input file into memory. @@ -47,19 +55,19 @@ def read_metadata(metadata_file, id_columns=VALID_ID_COLUMNS, chunk_size=None): For standard use, request a metadata file and get a pandas DataFrame. - >>> read_metadata("tests/functional/filter/data/metadata.tsv").index.values[0] + >>> read_metadata("tests/functional/filter/data/metadata.tsv", ("\t",)).index.values[0] 'COL/FLR_00024/2015' Requesting an index column that doesn't exist should produce an error. 
- >>> read_metadata("tests/functional/filter/data/metadata.tsv", id_columns=("Virus name",)) + >>> read_metadata("tests/functional/filter/data/metadata.tsv", ("\t",), id_columns=("Virus name",)) Traceback (most recent call last): ... Exception: None of the possible id columns (('Virus name',)) were found in the metadata's columns ('strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url') We also allow iterating through metadata in fixed chunk sizes. - >>> for chunk in read_metadata("tests/functional/filter/data/metadata.tsv", chunk_size=5): + >>> for chunk in read_metadata("tests/functional/filter/data/metadata.tsv", ("\t",), chunk_size=5): ... print(chunk.shape) ... (5, 14) @@ -68,7 +76,7 @@ def read_metadata(metadata_file, id_columns=VALID_ID_COLUMNS, chunk_size=None): """ kwargs = { - "sep": _get_delimiter(metadata_file), + "sep": _get_delimiter(metadata_file, delimiters), "engine": "c", "skipinitialspace": True, "na_filter": False, @@ -111,7 +119,7 @@ def read_metadata(metadata_file, id_columns=VALID_ID_COLUMNS, chunk_size=None): ) -def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, id_column=None): +def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ERROR_FIRST, id_column=None): """ Read rows from *table* file and yield each row as a single dict. @@ -123,6 +131,10 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i table: str Path to a CSV or TSV file or IO buffer + delimiters : list of str + List of possible delimiters to check for between columns in the metadata. + Only one delimiter will be inferred. 
+ duplicate_reporting: DataErrorMethod, optional How should duplicate records be reported @@ -158,12 +170,11 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i try: # Note: this sort of duplicates _get_delimiter(), but it's easier if # this is separate since it handles non-seekable buffers. - dialect = csv.Sniffer().sniff(table_sample, VALID_DELIMITERS) - except csv.Error as err: - raise AugurError( - f"Could not determine the delimiter of {table!r}. " - "File must be a CSV or TSV." - ) from err + dialect = csv.Sniffer().sniff(table_sample, delimiters) + except csv.Error as error: + # This assumes all csv.Errors imply a delimiter issue. That might + # change in a future Python version. + raise InvalidDelimiter from error metadata_reader = csv.DictReader(handle, dialect=dialect) if duplicate_reporting is DataErrorMethod.SILENT: @@ -205,7 +216,7 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i raise ValueError(f"Encountered unhandled duplicate reporting method: {duplicate_reporting!r}") -def read_metadata_with_sequences(metadata, fasta, seq_id_column, seq_field='sequence', +def read_metadata_with_sequences(metadata, metadata_delimiters, fasta, seq_id_column, seq_field='sequence', unmatched_reporting=DataErrorMethod.ERROR_FIRST, duplicate_reporting=DataErrorMethod.ERROR_FIRST): """ Read rows from *metadata* file and yield each row as a single dict that has @@ -235,6 +246,9 @@ def read_metadata_with_sequences(metadata, fasta, seq_id_column, seq_field='sequ metadata: str Path to a CSV or TSV metadata file + metadata_delimiters : list of str + List of possible delimiters to check for between columns in the metadata. 
+ fasta: str Path to a plain or gzipped FASTA file @@ -297,7 +311,7 @@ def read_metadata_with_sequences(metadata, fasta, seq_id_column, seq_field='sequ # Silencing duplicate reporting here because we will need to handle duplicates # in both the metadata and FASTA files after processing all the records here. - for record in read_table_to_dict(metadata, duplicate_reporting=DataErrorMethod.SILENT): + for record in read_table_to_dict(metadata, metadata_delimiters, duplicate_reporting=DataErrorMethod.SILENT): seq_id = record.get(seq_id_column) if seq_id is None: @@ -437,14 +451,13 @@ def write_records_to_tsv(records, output_file): tsv_writer.writerow(record) -def _get_delimiter(path: str): - """Get the delimiter of a file.""" +def _get_delimiter(path: str, valid_delimiters: Iterable[str]): + """Get the delimiter of a file given a list of valid delimiters.""" with open_file(path) as file: try: # Infer the delimiter from the first line. - return csv.Sniffer().sniff(file.readline(), VALID_DELIMITERS).delimiter - except csv.Error as err: - raise AugurError( - f"Could not determine the delimiter of {path!r}. " - "File must be a CSV or TSV." - ) from err + return csv.Sniffer().sniff(file.readline(), valid_delimiters).delimiter + except csv.Error as error: + # This assumes all csv.Errors imply a delimiter issue. That might + # change in a future Python version. 
+ raise InvalidDelimiter from error diff --git a/augur/refine.py b/augur/refine.py index d27916b5b..30dab6ee4 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -6,7 +6,7 @@ from Bio import Phylo from .dates import get_numerical_dates from .dates.errors import InvalidYearBounds -from .io.metadata import read_metadata +from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata from .utils import read_tree, write_json, InvalidTreeError from .errors import AugurError from treetime.vcf_utils import read_vcf @@ -98,7 +98,9 @@ def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("refine", help=__doc__) parser.add_argument('--alignment', '-a', help="alignment in fasta or VCF format") parser.add_argument('--tree', '-t', required=True, help="prebuilt Newick") - parser.add_argument('--metadata', type=str, metavar="FILE", help="sequence metadata, as CSV or TSV") + parser.add_argument('--metadata', type=str, metavar="FILE", help="sequence metadata") + parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", + help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") parser.add_argument('--output-tree', type=str, help='file name to write tree to') parser.add_argument('--output-node-data', type=str, help='file name to write branch lengths as node data') parser.add_argument('--use-fft', action="store_true", help="produce timetree using FFT for convolutions") @@ -204,7 +206,14 @@ def run(args): if args.metadata is None: print("ERROR: meta data with dates is required for time tree reconstruction", file=sys.stderr) return 1 - metadata = read_metadata(args.metadata) + try: + metadata = read_metadata(args.metadata, args.metadata_delimiters) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." 
+ ) try: dates = get_numerical_dates(metadata, fmt=args.date_format, min_max_year=args.year_bounds) diff --git a/augur/traits.py b/augur/traits.py index c9f812ff4..5dfe4dc39 100644 --- a/augur/traits.py +++ b/augur/traits.py @@ -6,7 +6,8 @@ from collections import defaultdict import os, sys import pandas as pd -from .io.metadata import read_metadata +from .errors import AugurError +from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata from .utils import write_json, get_json_name TINY = 1e-12 @@ -100,7 +101,9 @@ def mugration_inference(tree=None, seq_meta=None, field='country', confidence=Tr def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("traits", help=__doc__) parser.add_argument('--tree', '-t', required=True, help="tree to perform trait reconstruction on") - parser.add_argument('--metadata', required=True, metavar="FILE", help="table with metadata, as CSV or TSV") + parser.add_argument('--metadata', required=True, metavar="FILE", help="table with metadata") + parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", + help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") parser.add_argument('--weights', required=False, help="tsv/csv table with equilibrium probabilities of discrete states") parser.add_argument('--columns', required=True, nargs='+', help='metadata fields to perform discrete reconstruction on') @@ -126,7 +129,14 @@ def run(args): command line arguments are parsed by argparse """ tree_fname = args.tree - traits = read_metadata(args.metadata) + try: + traits = read_metadata(args.metadata, args.metadata_delimiters) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." 
+ ) from Bio import Phylo T = Phylo.read(tree_fname, 'newick') diff --git a/tests/functional/filter/cram/filter-metadata-delimiter.t b/tests/functional/filter/cram/filter-metadata-delimiter.t index ffce8207b..971a6c5e6 100644 --- a/tests/functional/filter/cram/filter-metadata-delimiter.t +++ b/tests/functional/filter/cram/filter-metadata-delimiter.t @@ -2,7 +2,7 @@ Setup $ source "$TESTDIR"/_setup.sh -Comma-delimited metadata is allowed. However, the output metadata will be tab-delimited. +Comma-delimited metadata is allowed by default. However, the output metadata will be tab-delimited. $ cat >metadata.txt <<~~ > strain,column @@ -18,7 +18,7 @@ Comma-delimited metadata is allowed. However, the output metadata will be tab-de strain\tcolumn (esc) SEQ_2\tB (esc) -Colon-delimited metadata is not allowed. +Colon-delimited metadata is not allowed by default. $ cat >metadata.txt <<~~ > strain:column @@ -30,5 +30,39 @@ Colon-delimited metadata is not allowed. > --metadata metadata.txt \ > --exclude-where column=A \ > --output-metadata filtered.txt > /dev/null - ERROR: Could not determine the delimiter of 'metadata.txt'. File must be a CSV or TSV. + ERROR: Could not determine the delimiter of 'metadata.txt'. Valid delimiters are: (',', '\t'). This can be changed with --metadata-delimiters. [2] + +Pass the default valid delimiters explicitly in reverse order. +Note: this shows how to specify a tab character in the list, though it shouldn't be necessary for most users. + + $ cat >metadata.txt <<~~ + > strain:column + > SEQ_1:A + > SEQ_2:B + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.txt \ + > --metadata-delimiters $'\t' ',' \ + > --exclude-where column=A \ + > --output-metadata filtered.txt > /dev/null + ERROR: Could not determine the delimiter of 'metadata.txt'. Valid delimiters are: ['\t', ',']. This can be changed with --metadata-delimiters. + [2] + +Allow colon-delimited metadata. However, the output metadata will be tab-delimited. 
+ + $ cat >metadata.txt <<~~ + > strain:column + > SEQ_1:A + > SEQ_2:B + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.txt \ + > --metadata-delimiters ':' \ + > --exclude-where column=A \ + > --output-metadata filtered.txt > /dev/null + $ cat filtered.txt + strain\tcolumn (esc) + SEQ_2\tB (esc) diff --git a/tests/io/test_metadata.py b/tests/io/test_metadata.py index 42aae001a..9d77067fc 100644 --- a/tests/io/test_metadata.py +++ b/tests/io/test_metadata.py @@ -4,7 +4,7 @@ from io import StringIO from augur.errors import AugurError -from augur.io.metadata import read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv +from augur.io.metadata import InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv from augur.types import DataErrorMethod @@ -39,13 +39,13 @@ def test_read_table_to_dict_with_csv(self, tmpdir, expected_record): fh.write('strain,date,country\n') fh.write('SEQ_A,2020-10-03,USA\n') - record = next(read_table_to_dict(path)) + record = next(read_table_to_dict(path, (','))) assert record == expected_record def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_record): stdin = StringIO('strain,date,country\nSEQ_A,2020-10-03,USA\n') mp_context.setattr('sys.stdin', stdin) - record = next(read_table_to_dict(sys.stdin)) + record = next(read_table_to_dict(sys.stdin, (','))) assert record == expected_record def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record): @@ -54,13 +54,13 @@ def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record): fh.write('strain\tdate\tcountry\n') fh.write('SEQ_A\t2020-10-03\tUSA\n') - record = next(read_table_to_dict(path)) + record = next(read_table_to_dict(path, ('\t'))) assert record == expected_record def test_read_table_to_dict_with_tsv_from_stdin(self, mp_context, expected_record): stdin = StringIO('strain\tdate\tcountry\nSEQ_A\t2020-10-03\tUSA\n') mp_context.setattr('sys.stdin', stdin) - record = 
next(read_table_to_dict(sys.stdin)) + record = next(read_table_to_dict(sys.stdin, ('\t'))) assert record == expected_record def test_read_table_to_dict_with_bad_delimiter(self, tmpdir): @@ -69,26 +69,24 @@ def test_read_table_to_dict_with_bad_delimiter(self, tmpdir): fh.write('strain date country\n') fh.write('SEQ_A 2020-10-03 USA\n') - with pytest.raises(AugurError) as e_info: - next(read_table_to_dict(path)) - - assert str(e_info.value) == f"Could not determine the delimiter of {path!r}. File must be a CSV or TSV." + with pytest.raises(InvalidDelimiter): + next(read_table_to_dict(path, (',', '\t'))) @pytest.mark.parametrize('id_column', ['strain', None]) def test_read_table_to_dict_with_duplicates(self, metadata_with_duplicate, id_column): with pytest.raises(AugurError) as e_info: - list(read_table_to_dict(metadata_with_duplicate, id_column=id_column)) + list(read_table_to_dict(metadata_with_duplicate, ('\t'), id_column=id_column)) assert str(e_info.value) == f"Encountered record with duplicate id 'SEQ_A' in {metadata_with_duplicate!r}" @pytest.mark.parametrize('id_column', ['strain', None]) def test_read_table_to_dict_with_duplicates_error_all(self, metadata_with_duplicate, id_column): with pytest.raises(AugurError) as e_info: - list(read_table_to_dict(metadata_with_duplicate, DataErrorMethod("error_all"), id_column=id_column)) + list(read_table_to_dict(metadata_with_duplicate, ('\t'), DataErrorMethod("error_all"), id_column=id_column)) assert str(e_info.value) == f"The following records are duplicated in {metadata_with_duplicate!r}:\n'SEQ_A'\n'SEQ_B'" @pytest.mark.parametrize('id_column', ['strain', None]) def test_read_table_to_dict_with_duplicates_warning(self, capsys, metadata_with_duplicate, id_column): - list(read_table_to_dict(metadata_with_duplicate, DataErrorMethod('warn'), id_column=id_column)) + list(read_table_to_dict(metadata_with_duplicate, ('\t'), DataErrorMethod('warn'), id_column=id_column)) captured = capsys.readouterr() assert captured.err == 
( f"WARNING: Encountered record with duplicate id 'SEQ_A' in {metadata_with_duplicate!r}\n" @@ -97,13 +95,13 @@ def test_read_table_to_dict_with_duplicates_warning(self, capsys, metadata_with_ ) def test_read_table_to_dict_with_duplicates_silent(self, capsys, metadata_with_duplicate): - list(read_table_to_dict(metadata_with_duplicate, DataErrorMethod('silent'))) + list(read_table_to_dict(metadata_with_duplicate, ('\t'), DataErrorMethod('silent'))) assert "WARNING" not in capsys.readouterr().err def test_read_table_to_dict_with_duplicate_and_bad_id(self, metadata_with_duplicate): id_column = "bad_id" with pytest.raises(AugurError) as e_info: - list(read_table_to_dict(metadata_with_duplicate, id_column=id_column)) + list(read_table_to_dict(metadata_with_duplicate, ('\t'), id_column=id_column)) assert str(e_info.value) == f"The provided id column {id_column!r} does not exist in {metadata_with_duplicate!r}." @@ -194,7 +192,7 @@ def metadata_with_unmatched_and_dup(tmpdir, metadata_file): class TestReadMetadataWithSequence: def test_read_metadata_with_sequence(self, metadata_file, fasta_file): - records = list(read_metadata_with_sequences(metadata_file, fasta_file, 'strain')) + records = list(read_metadata_with_sequences(metadata_file, ('\t',), fasta_file, 'strain')) assert len(records) == 4 for record in records: seq_base = record['strain'].split("_")[-1].upper() @@ -204,18 +202,19 @@ def test_read_metadata_with_sequence(self, metadata_file, fasta_file): def test_read_metadata_with_sequences_with_bad_id(self, metadata_file, fasta_file): id_field = "bad_id" with pytest.raises(AugurError) as e_info: - next(read_metadata_with_sequences(metadata_file, fasta_file, id_field)) + next(read_metadata_with_sequences(metadata_file, ('\t',), fasta_file, id_field)) assert str(e_info.value) == f"The provided sequence id column {id_field!r} does not exist in the metadata." 
def test_read_metadata_with_sequences_with_unmatched(self, metadata_with_unmatched, fasta_with_unmatched): with pytest.raises(AugurError) as e_info: - list(read_metadata_with_sequences(metadata_with_unmatched, fasta_with_unmatched, 'strain')) + list(read_metadata_with_sequences(metadata_with_unmatched, ('\t',), fasta_with_unmatched, 'strain')) assert str(e_info.value) == "Encountered metadata record 'EXTRA_METADATA_A' without a matching sequence." def test_read_metadata_with_sequences_with_unmatched_error_all(self, metadata_with_unmatched, fasta_with_unmatched): with pytest.raises(AugurError) as e_info: list(read_metadata_with_sequences( metadata_with_unmatched, + ('\t',), fasta_with_unmatched, 'strain', unmatched_reporting=DataErrorMethod.ERROR_ALL)) @@ -231,6 +230,7 @@ def test_read_metadata_with_sequences_with_unmatched_error_all(self, metadata_wi def test_read_metadata_with_sequences_with_unmatched_warning(self, capsys, metadata_with_unmatched, fasta_with_unmatched): records = list(read_metadata_with_sequences( metadata_with_unmatched, + ('\t',), fasta_with_unmatched, 'strain', unmatched_reporting=DataErrorMethod.WARN)) @@ -251,6 +251,7 @@ def test_read_metadata_with_sequences_with_unmatched_warning(self, capsys, metad def test_read_metadata_with_sequences_with_unmatched_silent(self, capsys, metadata_with_unmatched, fasta_with_unmatched): records = list(read_metadata_with_sequences( metadata_with_unmatched, + ('\t',), fasta_with_unmatched, 'strain', unmatched_reporting=DataErrorMethod.SILENT)) @@ -260,17 +261,17 @@ def test_read_metadata_with_sequences_with_unmatched_silent(self, capsys, metada def test_read_metadata_with_sequences_with_dup_metadata(self, metadata_with_dup, fasta_file): with pytest.raises(AugurError) as e_info: - list(read_metadata_with_sequences(metadata_with_dup, fasta_file, 'strain')) + list(read_metadata_with_sequences(metadata_with_dup, ('\t',), fasta_file, 'strain')) assert str(e_info.value) == "Encountered metadata record with duplicate 
id 'SEQ_C'." def test_read_metadata_with_sequences_with_dup_fasta(self, metadata_file, fasta_with_dup): with pytest.raises(AugurError) as e_info: - list(read_metadata_with_sequences(metadata_file, fasta_with_dup, 'strain')) + list(read_metadata_with_sequences(metadata_file, ('\t',), fasta_with_dup, 'strain')) assert str(e_info.value) == "Encountered sequence record with duplicate id 'SEQ_A'." def test_read_metadata_with_sequences_with_dup_both(self, metadata_with_dup, fasta_with_dup): with pytest.raises(AugurError) as e_info: - list(read_metadata_with_sequences(metadata_with_dup, fasta_with_dup, 'strain')) + list(read_metadata_with_sequences(metadata_with_dup, ('\t',), fasta_with_dup, 'strain')) # Expected to error on first duplicate sequence since we check sequences first assert str(e_info.value) == "Encountered sequence record with duplicate id 'SEQ_A'." @@ -278,6 +279,7 @@ def test_read_metadata_with_sequences_with_dup_error_all(self, metadata_with_dup with pytest.raises(AugurError) as e_info: list(read_metadata_with_sequences( metadata_with_dup, + ('\t',), fasta_with_dup, 'strain', duplicate_reporting=DataErrorMethod.ERROR_ALL @@ -294,6 +296,7 @@ def test_read_metadata_with_sequences_with_dup_error_all(self, metadata_with_dup def test_read_metadata_with_sequences_with_dup_warn(self, capsys, metadata_with_dup, fasta_with_dup): records = list(read_metadata_with_sequences( metadata_with_dup, + ('\t',), fasta_with_dup, 'strain', duplicate_reporting=DataErrorMethod.WARN @@ -317,6 +320,7 @@ def test_read_metadata_with_sequences_with_dup_warn(self, capsys, metadata_with_ def test_read_metadata_with_sequences_with_dup_silent(self, capsys, metadata_with_dup, fasta_with_dup): records = list(read_metadata_with_sequences( metadata_with_dup, + ('\t',), fasta_with_dup, 'strain', duplicate_reporting=DataErrorMethod.SILENT @@ -327,7 +331,7 @@ def test_read_metadata_with_sequences_with_dup_silent(self, capsys, metadata_wit def 
test_read_metadata_with_sequences_with_extra_and_dup(self, metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup): with pytest.raises(AugurError) as e_info: - list(read_metadata_with_sequences(metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup, 'strain')) + list(read_metadata_with_sequences(metadata_with_unmatched_and_dup, ('\t',), fasta_with_unmatched_and_dup, 'strain')) # Expected to error on first duplicate sequence since we check duplicate sequences first assert str(e_info.value) == "Encountered sequence record with duplicate id 'SEQ_A'." @@ -335,6 +339,7 @@ def test_read_metadata_with_sequences_with_extra_and_dup_error_all(self, metadat with pytest.raises(AugurError) as e_info: list(read_metadata_with_sequences( metadata_with_unmatched_and_dup, + ('\t',), fasta_with_unmatched_and_dup, 'strain', unmatched_reporting=DataErrorMethod.ERROR_ALL, @@ -358,6 +363,7 @@ def test_read_metadata_with_sequences_with_extra_and_dup_warn_unmatched(self, ca with pytest.raises(AugurError) as e_info: list(read_metadata_with_sequences( metadata_with_unmatched_and_dup, + ('\t',), fasta_with_unmatched_and_dup, 'strain', unmatched_reporting=DataErrorMethod.WARN, @@ -387,6 +393,7 @@ def test_read_metadata_with_sequences_with_extra_and_dup_warn_dups(self, capsys, with pytest.raises(AugurError) as e_info: list(read_metadata_with_sequences( metadata_with_unmatched_and_dup, + ('\t',), fasta_with_unmatched_and_dup, 'strain', unmatched_reporting=DataErrorMethod.ERROR_ALL, @@ -418,6 +425,7 @@ def test_read_metadata_with_sequences_with_extra_and_dup_warn_dups(self, capsys, def test_read_metadata_with_sequences_with_extra_and_dup_warn_both(self, capsys, metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup): records = list(read_metadata_with_sequences( metadata_with_unmatched_and_dup, + ('\t',), fasta_with_unmatched_and_dup, 'strain', unmatched_reporting=DataErrorMethod.WARN,