From 55f0d8733530d88b14fe6f4a9b9e47d2fc5de47e Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Thu, 6 Apr 2023 13:12:50 -0700 Subject: [PATCH 1/5] Allow customization of input metadata delimiter Previously, there was hardcoded support for `,` and `\t`. Remove the hardcoding by: 1. Making valid delimiters a required parameter to `read_metadata()`. 2. In subcommands, adding a new flag `--metadata-delimiters` with a default of the previously hardcoded delimiters, and passing that list into `read_metadata()`. --- augur/curate/__init__.py | 37 ++++++++---- augur/export_v1.py | 16 ++++- augur/export_v2.py | 15 ++++- augur/filter/__init__.py | 5 +- augur/filter/_run.py | 21 +++++-- augur/frequencies.py | 16 ++++- augur/io/metadata.py | 60 +++++++++++-------- augur/refine.py | 15 ++++- augur/traits.py | 16 ++++- .../filter/cram/filter-metadata-delimiter.t | 40 ++++++++++++- tests/io/test_metadata.py | 50 +++++++++------- 11 files changed, 209 insertions(+), 82 deletions(-) diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index ee0938cc2..3ce793229 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -9,7 +9,7 @@ from augur.argparse_ import add_command_subparsers from augur.errors import AugurError from augur.io.json import dump_ndjson, load_ndjson -from augur.io.metadata import read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv +from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv from augur.io.sequences import write_records_to_fasta from augur.types import DataErrorMethod from . import normalize_strings, passthru @@ -46,11 +46,13 @@ def create_shared_parser(): If no input options are provided, commands will try to read NDJSON records from stdin. """) shared_inputs.add_argument("--metadata", help="Input metadata file, as CSV or TSV. 
Accepts '-' to read metadata from stdin.") + help="Input metadata file. Accepts '-' to read metadata from stdin.") shared_inputs.add_argument("--id-column", help="Name of the metadata column that contains the record identifier for reporting duplicate records. " "Uses the first column of the metadata file if not provided. " "Ignored if also providing a FASTA file input.") + shared_inputs.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", + help="Delimiters to accept when reading a metadata file.") shared_inputs.add_argument("--fasta", help="Plain or gzipped FASTA file. Headers can only contain the sequence id used to match a metadata record. " + @@ -133,15 +135,30 @@ def run(args): args.metadata = sys.stdin if args.metadata and args.fasta: - records = read_metadata_with_sequences( - args.metadata, - args.fasta, - args.seq_id_column, - args.seq_field, - DataErrorMethod(args.unmatched_reporting), - DataErrorMethod(args.duplicate_reporting)) + try: + records = read_metadata_with_sequences( + args.metadata, + args.metadata_delimiters, + args.fasta, + args.seq_id_column, + args.seq_field, + DataErrorMethod(args.unmatched_reporting), + DataErrorMethod(args.duplicate_reporting)) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." + ) elif args.metadata: - records = read_table_to_dict(args.metadata, DataErrorMethod(args.duplicate_reporting), args.id_column) + try: + records = read_table_to_dict(args.metadata, args.metadata_delimiters, DataErrorMethod(args.duplicate_reporting), args.id_column) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." 
+ ) elif not sys.stdin.isatty(): records = load_ndjson(sys.stdin) else: diff --git a/augur/export_v1.py b/augur/export_v1.py index c6e2de9e6..670d5e8d6 100644 --- a/augur/export_v1.py +++ b/augur/export_v1.py @@ -9,8 +9,9 @@ from Bio import Phylo from argparse import SUPPRESS from collections import defaultdict +from .errors import AugurError from .argparse_ import ExtendAction -from .io.metadata import read_metadata +from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors def convert_tree_to_json_structure(node, metadata, div=0, strains=None): @@ -311,7 +312,9 @@ def get_root_sequence(root_node, ref=None, translations=None): def add_core_args(parser): core = parser.add_argument_group("REQUIRED") core.add_argument('--tree','-t', required=True, help="tree to perform trait reconstruction on") - core.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV") + core.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata") + core.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", + help="delimiters to accept when reading a metadata file") core.add_argument('--node-data', required=True, nargs='+', action=ExtendAction, help="JSON files with meta data for each node") core.add_argument('--output-tree', help="JSON file name that is passed on to auspice (e.g., zika_tree.json).") core.add_argument('--output-meta', help="JSON file name that is passed on to auspice (e.g., zika_meta.json).") @@ -364,7 +367,14 @@ def run(args): meta_json = read_config(args.auspice_config) ensure_config_is_v1(meta_json) - meta_tsv = read_metadata(args.metadata) + try: + meta_tsv = read_metadata(args.metadata, args.metadata_delimiters) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. 
" + "This can be changed with --metadata-delimiters." + ) add_tsv_metadata_to_nodes(nodes, meta_tsv, meta_json) tree_layout(T) diff --git a/augur/export_v2.py b/augur/export_v2.py index 3f606c797..5c8d916d7 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -11,7 +11,8 @@ from Bio import Phylo from .argparse_ import ExtendAction -from .io.metadata import read_metadata +from .errors import AugurError +from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata from .types import ValidationMode from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors from .validate import export_v2 as validate_v2, auspice_config_v2 as validate_auspice_config_v2, ValidateError @@ -867,7 +868,9 @@ def register_parser(parent_subparsers): optional_inputs = parser.add_argument_group( title="OPTIONAL INPUT FILES" ) - optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree, as CSV or TSV") + optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree") + optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", + help="delimiters to accept when reading a metadata file") optional_inputs.add_argument('--colors', metavar="FILE", help="Custom color definitions, one per line in the format `TRAIT_TYPE\\tTRAIT_VALUE\\tHEX_CODE`") optional_inputs.add_argument('--lat-longs', metavar="TSV", help="Latitudes and longitudes for geography traits (overrides built in mappings)") @@ -1039,13 +1042,19 @@ def run(args): if args.metadata is not None: try: - metadata_file = read_metadata(args.metadata).to_dict(orient="index") + metadata_file = read_metadata(args.metadata, args.metadata_delimiters).to_dict(orient="index") for strain in metadata_file.keys(): if "strain" not in metadata_file[strain]: metadata_file[strain]["strain"] = strain except FileNotFoundError: print(f"ERROR: meta data file ({args.metadata}) does not 
exist", file=sys.stderr) sys.exit(2) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." + ) except Exception as error: print(f"ERROR: {error}", file=sys.stderr) sys.exit(1) diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index fe1b70b02..a0366ba00 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -2,7 +2,7 @@ Filter and subsample a sequence set. """ from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT -from augur.io.metadata import VALID_ID_COLUMNS +from augur.io.metadata import DEFAULT_DELIMITERS, VALID_ID_COLUMNS from augur.types import EmptyOutputReportingMethod from . import constants @@ -14,11 +14,12 @@ def register_arguments(parser): unit tests that use this function to create argparser. """ input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered") - input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV") + input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata") input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format") input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.") input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. 
Increasing this number can speed up filtering at the cost of more memory used.") input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'") + input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file.") metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata") metadata_filter_group.add_argument( diff --git a/augur/filter/_run.py b/augur/filter/_run.py index aff4a3b69..03393dded 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -15,7 +15,7 @@ DELIMITER as SEQUENCE_INDEX_DELIMITER, ) from augur.io.file import open_file -from augur.io.metadata import read_metadata +from augur.io.metadata import InvalidDelimiter, read_metadata from augur.io.sequences import read_sequences, write_sequences from augur.io.print import print_err from augur.io.vcf import is_vcf as filename_is_vcf, write_vcf @@ -163,11 +163,19 @@ def run(args): all_sequences_to_include = set() filter_counts = defaultdict(int) - metadata_reader = read_metadata( - args.metadata, - id_columns=args.metadata_id_columns, - chunk_size=args.metadata_chunk_size, - ) + try: + metadata_reader = read_metadata( + args.metadata, + args.metadata_delimiters, + id_columns=args.metadata_id_columns, + chunk_size=args.metadata_chunk_size, + ) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." + ) for metadata in metadata_reader: duplicate_strains = ( set(metadata.index[metadata.index.duplicated()]) | @@ -309,6 +317,7 @@ def run(args): # have passed filters. 
metadata_reader = read_metadata( args.metadata, + args.metadata_delimiters, id_columns=args.metadata_id_columns, chunk_size=args.metadata_chunk_size, ) diff --git a/augur/frequencies.py b/augur/frequencies.py index 6dd040268..16cea8838 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -7,10 +7,11 @@ from Bio import Phylo, AlignIO from Bio.Align import MultipleSeqAlignment +from .errors import AugurError from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT, get_numerical_dates -from .io.metadata import read_metadata +from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata from .utils import read_node_data, write_json @@ -20,7 +21,9 @@ def register_parser(parent_subparsers): parser.add_argument('--method', choices=["diffusion", "kde"], required=True, help="method by which frequencies should be estimated") parser.add_argument('--metadata', type=str, required=True, metavar="FILE", - help="metadata including dates for given samples, as CSV or TSV") + help="metadata including dates for given samples") + parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", + help="delimiters to accept when reading a metadata file") parser.add_argument('--regions', type=str, nargs='+', default=['global'], help="region to subsample to") parser.add_argument("--pivot-interval", type=int, default=3, @@ -80,7 +83,14 @@ def format_frequencies(freq): def run(args): - metadata = read_metadata(args.metadata) + try: + metadata = read_metadata(args.metadata, args.metadata_delimiters) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." 
+ ) dates = get_numerical_dates(metadata, fmt='%Y-%m-%d') stiffness = args.stiffness inertia = args.inertia diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 5219cb75e..01564be4e 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -1,5 +1,6 @@ import csv import os +from typing import Iterable import pandas as pd import pyfastx import sys @@ -12,22 +13,27 @@ from .file import open_file -# Accept the following delimiters when reading a metadata file. -VALID_DELIMITERS = (',', '\t') +DEFAULT_DELIMITERS = (',', '\t') # Accept the following column names to represent a unique ID per row, in order # of preference. VALID_ID_COLUMNS = ("strain", "name") -def read_metadata(metadata_file, id_columns=VALID_ID_COLUMNS, chunk_size=None): - """Read metadata from a given filename and into a pandas `DataFrame` or +class InvalidDelimiter(Exception): + pass + + +def read_metadata(metadata_file, delimiters, id_columns=VALID_ID_COLUMNS, chunk_size=None): + r"""Read metadata from a given filename and into a pandas `DataFrame` or `TextFileReader` object. Parameters ---------- metadata_file : str Path to a metadata file to load. + delimiters : list of str + List of possible delimiters to check for between columns in the metadata. id_columns : list of str List of possible id column names to check for, ordered by priority. chunk_size : int @@ -47,19 +53,19 @@ def read_metadata(metadata_file, id_columns=VALID_ID_COLUMNS, chunk_size=None): For standard use, request a metadata file and get a pandas DataFrame. - >>> read_metadata("tests/functional/filter/data/metadata.tsv").index.values[0] + >>> read_metadata("tests/functional/filter/data/metadata.tsv", ("\t",)).index.values[0] 'COL/FLR_00024/2015' Requesting an index column that doesn't exist should produce an error. 
- >>> read_metadata("tests/functional/filter/data/metadata.tsv", id_columns=("Virus name",)) + >>> read_metadata("tests/functional/filter/data/metadata.tsv", ("\t",), id_columns=("Virus name",)) Traceback (most recent call last): ... Exception: None of the possible id columns (('Virus name',)) were found in the metadata's columns ('strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url') We also allow iterating through metadata in fixed chunk sizes. - >>> for chunk in read_metadata("tests/functional/filter/data/metadata.tsv", chunk_size=5): + >>> for chunk in read_metadata("tests/functional/filter/data/metadata.tsv", ("\t",), chunk_size=5): ... print(chunk.shape) ... (5, 14) @@ -68,7 +74,7 @@ def read_metadata(metadata_file, id_columns=VALID_ID_COLUMNS, chunk_size=None): """ kwargs = { - "sep": _get_delimiter(metadata_file), + "sep": _get_delimiter(metadata_file, delimiters), "engine": "c", "skipinitialspace": True, "na_filter": False, @@ -111,7 +117,7 @@ def read_metadata(metadata_file, id_columns=VALID_ID_COLUMNS, chunk_size=None): ) -def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, id_column=None): +def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ERROR_FIRST, id_column=None): """ Read rows from *table* file and yield each row as a single dict. @@ -123,6 +129,9 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i table: str Path to a CSV or TSV file or IO buffer + delimiters : list of str + List of possible delimiters to check for between columns in the metadata. + duplicate_reporting: DataErrorMethod, optional How should duplicate records be reported @@ -158,12 +167,11 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i try: # Note: this sort of duplicates _get_delimiter(), but it's easier if # this is separate since it handles non-seekable buffers. 
- dialect = csv.Sniffer().sniff(table_sample, VALID_DELIMITERS) - except csv.Error as err: - raise AugurError( - f"Could not determine the delimiter of {table!r}. " - "File must be a CSV or TSV." - ) from err + dialect = csv.Sniffer().sniff(table_sample, delimiters) + except csv.Error as error: + # This assumes all csv.Errors imply a delimiter issue. That might + # change in a future Python version. + raise InvalidDelimiter from error metadata_reader = csv.DictReader(handle, dialect=dialect) if duplicate_reporting is DataErrorMethod.SILENT: @@ -205,7 +213,7 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i raise ValueError(f"Encountered unhandled duplicate reporting method: {duplicate_reporting!r}") -def read_metadata_with_sequences(metadata, fasta, seq_id_column, seq_field='sequence', +def read_metadata_with_sequences(metadata, metadata_delimiters, fasta, seq_id_column, seq_field='sequence', unmatched_reporting=DataErrorMethod.ERROR_FIRST, duplicate_reporting=DataErrorMethod.ERROR_FIRST): """ Read rows from *metadata* file and yield each row as a single dict that has @@ -235,6 +243,9 @@ def read_metadata_with_sequences(metadata, fasta, seq_id_column, seq_field='sequ metadata: str Path to a CSV or TSV metadata file + metadata_delimiters : list of str + List of possible delimiters to check for between columns in the metadata. + fasta: str Path to a plain or gzipped FASTA file @@ -297,7 +308,7 @@ def read_metadata_with_sequences(metadata, fasta, seq_id_column, seq_field='sequ # Silencing duplicate reporting here because we will need to handle duplicates # in both the metadata and FASTA files after processing all the records here. 
- for record in read_table_to_dict(metadata, duplicate_reporting=DataErrorMethod.SILENT): + for record in read_table_to_dict(metadata, metadata_delimiters, duplicate_reporting=DataErrorMethod.SILENT): seq_id = record.get(seq_id_column) if seq_id is None: @@ -437,14 +448,13 @@ def write_records_to_tsv(records, output_file): tsv_writer.writerow(record) -def _get_delimiter(path: str): - """Get the delimiter of a file.""" +def _get_delimiter(path: str, valid_delimiters: Iterable[str]): + """Get the delimiter of a file given a list of valid delimiters.""" with open_file(path) as file: try: # Infer the delimiter from the first line. - return csv.Sniffer().sniff(file.readline(), VALID_DELIMITERS).delimiter - except csv.Error as err: - raise AugurError( - f"Could not determine the delimiter of {path!r}. " - "File must be a CSV or TSV." - ) from err + return csv.Sniffer().sniff(file.readline(), valid_delimiters).delimiter + except csv.Error as error: + # This assumes all csv.Errors imply a delimiter issue. That might + # change in a future Python version. 
+ raise InvalidDelimiter from error diff --git a/augur/refine.py b/augur/refine.py index d27916b5b..2335f826b 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -6,7 +6,7 @@ from Bio import Phylo from .dates import get_numerical_dates from .dates.errors import InvalidYearBounds -from .io.metadata import read_metadata +from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata from .utils import read_tree, write_json, InvalidTreeError from .errors import AugurError from treetime.vcf_utils import read_vcf @@ -98,7 +98,9 @@ def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("refine", help=__doc__) parser.add_argument('--alignment', '-a', help="alignment in fasta or VCF format") parser.add_argument('--tree', '-t', required=True, help="prebuilt Newick") - parser.add_argument('--metadata', type=str, metavar="FILE", help="sequence metadata, as CSV or TSV") + parser.add_argument('--metadata', type=str, metavar="FILE", help="sequence metadata") + parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", + help="delimiters to accept when reading a metadata file") parser.add_argument('--output-tree', type=str, help='file name to write tree to') parser.add_argument('--output-node-data', type=str, help='file name to write branch lengths as node data') parser.add_argument('--use-fft', action="store_true", help="produce timetree using FFT for convolutions") @@ -204,7 +206,14 @@ def run(args): if args.metadata is None: print("ERROR: meta data with dates is required for time tree reconstruction", file=sys.stderr) return 1 - metadata = read_metadata(args.metadata) + try: + metadata = read_metadata(args.metadata, args.metadata_delimiters) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." 
+ ) try: dates = get_numerical_dates(metadata, fmt=args.date_format, min_max_year=args.year_bounds) diff --git a/augur/traits.py b/augur/traits.py index c9f812ff4..0a423571b 100644 --- a/augur/traits.py +++ b/augur/traits.py @@ -6,7 +6,8 @@ from collections import defaultdict import os, sys import pandas as pd -from .io.metadata import read_metadata +from .errors import AugurError +from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata from .utils import write_json, get_json_name TINY = 1e-12 @@ -100,7 +101,9 @@ def mugration_inference(tree=None, seq_meta=None, field='country', confidence=Tr def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("traits", help=__doc__) parser.add_argument('--tree', '-t', required=True, help="tree to perform trait reconstruction on") - parser.add_argument('--metadata', required=True, metavar="FILE", help="table with metadata, as CSV or TSV") + parser.add_argument('--metadata', required=True, metavar="FILE", help="table with metadata") + parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", + help="delimiters to accept when reading a metadata file") parser.add_argument('--weights', required=False, help="tsv/csv table with equilibrium probabilities of discrete states") parser.add_argument('--columns', required=True, nargs='+', help='metadata fields to perform discrete reconstruction on') @@ -126,7 +129,14 @@ def run(args): command line arguments are parsed by argparse """ tree_fname = args.tree - traits = read_metadata(args.metadata) + try: + traits = read_metadata(args.metadata, args.metadata_delimiters) + except InvalidDelimiter: + raise AugurError( + f"Could not determine the delimiter of {args.metadata!r}. " + f"Valid delimiters are: {args.metadata_delimiters!r}. " + "This can be changed with --metadata-delimiters." 
+ ) from Bio import Phylo T = Phylo.read(tree_fname, 'newick') diff --git a/tests/functional/filter/cram/filter-metadata-delimiter.t b/tests/functional/filter/cram/filter-metadata-delimiter.t index ffce8207b..971a6c5e6 100644 --- a/tests/functional/filter/cram/filter-metadata-delimiter.t +++ b/tests/functional/filter/cram/filter-metadata-delimiter.t @@ -2,7 +2,7 @@ Setup $ source "$TESTDIR"/_setup.sh -Comma-delimited metadata is allowed. However, the output metadata will be tab-delimited. +Comma-delimited metadata is allowed by default. However, the output metadata will be tab-delimited. $ cat >metadata.txt <<~~ > strain,column @@ -18,7 +18,7 @@ Comma-delimited metadata is allowed. However, the output metadata will be tab-de strain\tcolumn (esc) SEQ_2\tB (esc) -Colon-delimited metadata is not allowed. +Colon-delimited metadata is not allowed by default. $ cat >metadata.txt <<~~ > strain:column @@ -30,5 +30,39 @@ Colon-delimited metadata is not allowed. > --metadata metadata.txt \ > --exclude-where column=A \ > --output-metadata filtered.txt > /dev/null - ERROR: Could not determine the delimiter of 'metadata.txt'. File must be a CSV or TSV. + ERROR: Could not determine the delimiter of 'metadata.txt'. Valid delimiters are: (',', '\t'). This can be changed with --metadata-delimiters. [2] + +Pass the default valid delimiters explicitly in reverse order. +Note: this shows how to specify a tab character in the list, though it shouldn't be necessary for most users. + + $ cat >metadata.txt <<~~ + > strain:column + > SEQ_1:A + > SEQ_2:B + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.txt \ + > --metadata-delimiters $'\t' ',' \ + > --exclude-where column=A \ + > --output-metadata filtered.txt > /dev/null + ERROR: Could not determine the delimiter of 'metadata.txt'. Valid delimiters are: ['\t', ',']. This can be changed with --metadata-delimiters. + [2] + +Allow colon-delimited metadata. However, the output metadata will be tab-delimited. 
+ + $ cat >metadata.txt <<~~ + > strain:column + > SEQ_1:A + > SEQ_2:B + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.txt \ + > --metadata-delimiters ':' \ + > --exclude-where column=A \ + > --output-metadata filtered.txt > /dev/null + $ cat filtered.txt + strain\tcolumn (esc) + SEQ_2\tB (esc) diff --git a/tests/io/test_metadata.py b/tests/io/test_metadata.py index 42aae001a..9d77067fc 100644 --- a/tests/io/test_metadata.py +++ b/tests/io/test_metadata.py @@ -4,7 +4,7 @@ from io import StringIO from augur.errors import AugurError -from augur.io.metadata import read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv +from augur.io.metadata import InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv from augur.types import DataErrorMethod @@ -39,13 +39,13 @@ def test_read_table_to_dict_with_csv(self, tmpdir, expected_record): fh.write('strain,date,country\n') fh.write('SEQ_A,2020-10-03,USA\n') - record = next(read_table_to_dict(path)) + record = next(read_table_to_dict(path, (','))) assert record == expected_record def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_record): stdin = StringIO('strain,date,country\nSEQ_A,2020-10-03,USA\n') mp_context.setattr('sys.stdin', stdin) - record = next(read_table_to_dict(sys.stdin)) + record = next(read_table_to_dict(sys.stdin, (','))) assert record == expected_record def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record): @@ -54,13 +54,13 @@ def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record): fh.write('strain\tdate\tcountry\n') fh.write('SEQ_A\t2020-10-03\tUSA\n') - record = next(read_table_to_dict(path)) + record = next(read_table_to_dict(path, ('\t'))) assert record == expected_record def test_read_table_to_dict_with_tsv_from_stdin(self, mp_context, expected_record): stdin = StringIO('strain\tdate\tcountry\nSEQ_A\t2020-10-03\tUSA\n') mp_context.setattr('sys.stdin', stdin) - record = 
next(read_table_to_dict(sys.stdin)) + record = next(read_table_to_dict(sys.stdin, ('\t'))) assert record == expected_record def test_read_table_to_dict_with_bad_delimiter(self, tmpdir): @@ -69,26 +69,24 @@ def test_read_table_to_dict_with_bad_delimiter(self, tmpdir): fh.write('strain date country\n') fh.write('SEQ_A 2020-10-03 USA\n') - with pytest.raises(AugurError) as e_info: - next(read_table_to_dict(path)) - - assert str(e_info.value) == f"Could not determine the delimiter of {path!r}. File must be a CSV or TSV." + with pytest.raises(InvalidDelimiter): + next(read_table_to_dict(path, (',', '\t'))) @pytest.mark.parametrize('id_column', ['strain', None]) def test_read_table_to_dict_with_duplicates(self, metadata_with_duplicate, id_column): with pytest.raises(AugurError) as e_info: - list(read_table_to_dict(metadata_with_duplicate, id_column=id_column)) + list(read_table_to_dict(metadata_with_duplicate, ('\t'), id_column=id_column)) assert str(e_info.value) == f"Encountered record with duplicate id 'SEQ_A' in {metadata_with_duplicate!r}" @pytest.mark.parametrize('id_column', ['strain', None]) def test_read_table_to_dict_with_duplicates_error_all(self, metadata_with_duplicate, id_column): with pytest.raises(AugurError) as e_info: - list(read_table_to_dict(metadata_with_duplicate, DataErrorMethod("error_all"), id_column=id_column)) + list(read_table_to_dict(metadata_with_duplicate, ('\t'), DataErrorMethod("error_all"), id_column=id_column)) assert str(e_info.value) == f"The following records are duplicated in {metadata_with_duplicate!r}:\n'SEQ_A'\n'SEQ_B'" @pytest.mark.parametrize('id_column', ['strain', None]) def test_read_table_to_dict_with_duplicates_warning(self, capsys, metadata_with_duplicate, id_column): - list(read_table_to_dict(metadata_with_duplicate, DataErrorMethod('warn'), id_column=id_column)) + list(read_table_to_dict(metadata_with_duplicate, ('\t'), DataErrorMethod('warn'), id_column=id_column)) captured = capsys.readouterr() assert captured.err == 
( f"WARNING: Encountered record with duplicate id 'SEQ_A' in {metadata_with_duplicate!r}\n" @@ -97,13 +95,13 @@ def test_read_table_to_dict_with_duplicates_warning(self, capsys, metadata_with_ ) def test_read_table_to_dict_with_duplicates_silent(self, capsys, metadata_with_duplicate): - list(read_table_to_dict(metadata_with_duplicate, DataErrorMethod('silent'))) + list(read_table_to_dict(metadata_with_duplicate, ('\t'), DataErrorMethod('silent'))) assert "WARNING" not in capsys.readouterr().err def test_read_table_to_dict_with_duplicate_and_bad_id(self, metadata_with_duplicate): id_column = "bad_id" with pytest.raises(AugurError) as e_info: - list(read_table_to_dict(metadata_with_duplicate, id_column=id_column)) + list(read_table_to_dict(metadata_with_duplicate, ('\t'), id_column=id_column)) assert str(e_info.value) == f"The provided id column {id_column!r} does not exist in {metadata_with_duplicate!r}." @@ -194,7 +192,7 @@ def metadata_with_unmatched_and_dup(tmpdir, metadata_file): class TestReadMetadataWithSequence: def test_read_metadata_with_sequence(self, metadata_file, fasta_file): - records = list(read_metadata_with_sequences(metadata_file, fasta_file, 'strain')) + records = list(read_metadata_with_sequences(metadata_file, ('\t',), fasta_file, 'strain')) assert len(records) == 4 for record in records: seq_base = record['strain'].split("_")[-1].upper() @@ -204,18 +202,19 @@ def test_read_metadata_with_sequence(self, metadata_file, fasta_file): def test_read_metadata_with_sequences_with_bad_id(self, metadata_file, fasta_file): id_field = "bad_id" with pytest.raises(AugurError) as e_info: - next(read_metadata_with_sequences(metadata_file, fasta_file, id_field)) + next(read_metadata_with_sequences(metadata_file, ('\t',), fasta_file, id_field)) assert str(e_info.value) == f"The provided sequence id column {id_field!r} does not exist in the metadata." 
def test_read_metadata_with_sequences_with_unmatched(self, metadata_with_unmatched, fasta_with_unmatched): with pytest.raises(AugurError) as e_info: - list(read_metadata_with_sequences(metadata_with_unmatched, fasta_with_unmatched, 'strain')) + list(read_metadata_with_sequences(metadata_with_unmatched, ('\t',), fasta_with_unmatched, 'strain')) assert str(e_info.value) == "Encountered metadata record 'EXTRA_METADATA_A' without a matching sequence." def test_read_metadata_with_sequences_with_unmatched_error_all(self, metadata_with_unmatched, fasta_with_unmatched): with pytest.raises(AugurError) as e_info: list(read_metadata_with_sequences( metadata_with_unmatched, + ('\t',), fasta_with_unmatched, 'strain', unmatched_reporting=DataErrorMethod.ERROR_ALL)) @@ -231,6 +230,7 @@ def test_read_metadata_with_sequences_with_unmatched_error_all(self, metadata_wi def test_read_metadata_with_sequences_with_unmatched_warning(self, capsys, metadata_with_unmatched, fasta_with_unmatched): records = list(read_metadata_with_sequences( metadata_with_unmatched, + ('\t',), fasta_with_unmatched, 'strain', unmatched_reporting=DataErrorMethod.WARN)) @@ -251,6 +251,7 @@ def test_read_metadata_with_sequences_with_unmatched_warning(self, capsys, metad def test_read_metadata_with_sequences_with_unmatched_silent(self, capsys, metadata_with_unmatched, fasta_with_unmatched): records = list(read_metadata_with_sequences( metadata_with_unmatched, + ('\t',), fasta_with_unmatched, 'strain', unmatched_reporting=DataErrorMethod.SILENT)) @@ -260,17 +261,17 @@ def test_read_metadata_with_sequences_with_unmatched_silent(self, capsys, metada def test_read_metadata_with_sequences_with_dup_metadata(self, metadata_with_dup, fasta_file): with pytest.raises(AugurError) as e_info: - list(read_metadata_with_sequences(metadata_with_dup, fasta_file, 'strain')) + list(read_metadata_with_sequences(metadata_with_dup, ('\t',), fasta_file, 'strain')) assert str(e_info.value) == "Encountered metadata record with duplicate 
id 'SEQ_C'." def test_read_metadata_with_sequences_with_dup_fasta(self, metadata_file, fasta_with_dup): with pytest.raises(AugurError) as e_info: - list(read_metadata_with_sequences(metadata_file, fasta_with_dup, 'strain')) + list(read_metadata_with_sequences(metadata_file, ('\t',), fasta_with_dup, 'strain')) assert str(e_info.value) == "Encountered sequence record with duplicate id 'SEQ_A'." def test_read_metadata_with_sequences_with_dup_both(self, metadata_with_dup, fasta_with_dup): with pytest.raises(AugurError) as e_info: - list(read_metadata_with_sequences(metadata_with_dup, fasta_with_dup, 'strain')) + list(read_metadata_with_sequences(metadata_with_dup, ('\t',), fasta_with_dup, 'strain')) # Expected to error on first duplicate sequence since we check sequences first assert str(e_info.value) == "Encountered sequence record with duplicate id 'SEQ_A'." @@ -278,6 +279,7 @@ def test_read_metadata_with_sequences_with_dup_error_all(self, metadata_with_dup with pytest.raises(AugurError) as e_info: list(read_metadata_with_sequences( metadata_with_dup, + ('\t',), fasta_with_dup, 'strain', duplicate_reporting=DataErrorMethod.ERROR_ALL @@ -294,6 +296,7 @@ def test_read_metadata_with_sequences_with_dup_error_all(self, metadata_with_dup def test_read_metadata_with_sequences_with_dup_warn(self, capsys, metadata_with_dup, fasta_with_dup): records = list(read_metadata_with_sequences( metadata_with_dup, + ('\t',), fasta_with_dup, 'strain', duplicate_reporting=DataErrorMethod.WARN @@ -317,6 +320,7 @@ def test_read_metadata_with_sequences_with_dup_warn(self, capsys, metadata_with_ def test_read_metadata_with_sequences_with_dup_silent(self, capsys, metadata_with_dup, fasta_with_dup): records = list(read_metadata_with_sequences( metadata_with_dup, + ('\t',), fasta_with_dup, 'strain', duplicate_reporting=DataErrorMethod.SILENT @@ -327,7 +331,7 @@ def test_read_metadata_with_sequences_with_dup_silent(self, capsys, metadata_wit def 
test_read_metadata_with_sequences_with_extra_and_dup(self, metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup): with pytest.raises(AugurError) as e_info: - list(read_metadata_with_sequences(metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup, 'strain')) + list(read_metadata_with_sequences(metadata_with_unmatched_and_dup, ('\t',), fasta_with_unmatched_and_dup, 'strain')) # Expected to error on first duplicate sequence since we check duplicate sequences first assert str(e_info.value) == "Encountered sequence record with duplicate id 'SEQ_A'." @@ -335,6 +339,7 @@ def test_read_metadata_with_sequences_with_extra_and_dup_error_all(self, metadat with pytest.raises(AugurError) as e_info: list(read_metadata_with_sequences( metadata_with_unmatched_and_dup, + ('\t',), fasta_with_unmatched_and_dup, 'strain', unmatched_reporting=DataErrorMethod.ERROR_ALL, @@ -358,6 +363,7 @@ def test_read_metadata_with_sequences_with_extra_and_dup_warn_unmatched(self, ca with pytest.raises(AugurError) as e_info: list(read_metadata_with_sequences( metadata_with_unmatched_and_dup, + ('\t',), fasta_with_unmatched_and_dup, 'strain', unmatched_reporting=DataErrorMethod.WARN, @@ -387,6 +393,7 @@ def test_read_metadata_with_sequences_with_extra_and_dup_warn_dups(self, capsys, with pytest.raises(AugurError) as e_info: list(read_metadata_with_sequences( metadata_with_unmatched_and_dup, + ('\t',), fasta_with_unmatched_and_dup, 'strain', unmatched_reporting=DataErrorMethod.ERROR_ALL, @@ -418,6 +425,7 @@ def test_read_metadata_with_sequences_with_extra_and_dup_warn_dups(self, capsys, def test_read_metadata_with_sequences_with_extra_and_dup_warn_both(self, capsys, metadata_with_unmatched_and_dup, fasta_with_unmatched_and_dup): records = list(read_metadata_with_sequences( metadata_with_unmatched_and_dup, + ('\t',), fasta_with_unmatched_and_dup, 'strain', unmatched_reporting=DataErrorMethod.WARN, From 1ac4649bc6044bc1333b0c07a8af70c815d451b5 Mon Sep 17 00:00:00 2001 From: Victor Lin 
<13424970+victorlin@users.noreply.github.com> Date: Thu, 6 Apr 2023 13:25:20 -0700 Subject: [PATCH 2/5] Update changelog for metadata delimiter changes --- CHANGES.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 79f52910e..4028bd9a3 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,12 +4,14 @@ ### Major Changes -* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this supported any arbitrary delimiters for the metadata. It is now restricted to CSV and TSV, which are the officially supported formats for all Augur subcommands that use this function. [#812][] (@victorlin) +* export, filter, frequencies, refine, traits: From versions 10.0.0 through 21.1.0, arbitrary delimiters for `--metadata` were supported due to internal implementation differences from the advertised CSV and TSV support. Starting with this version, non-CSV/TSV files will no longer be supported by default. To adjust for this breaking change, specify custom delimiters with the new `--metadata-delimiters` flag. [#1196][] (@victorlin) +* `augur.io.read_metadata`: Previously, this supported any arbitrary delimiters for the metadata. It now requires a new argument, `delimiters`. [#812][] (@victorlin) ### Features * Constrain `bcbio-gff` to >=0.7.0 and allow `Biopython` >=1.81 again. We had to introduce the `Biopython` constraint in v21.0.1 (see [#1152][]) due to `bcbio-gff` <0.7.0 relying on the removed `Biopython` feature `UnknownSeq`. [#1178][] (@corneliusroemer) * `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this used the Python parser engine for [`pandas.read_csv()`][]. Updated to use the C engine for faster reading of metadata. [#812][] (@victorlin) +* curate: Allow custom metadata delimiters with the new `--metadata-delimiters` flag. 
[#1196][] (@victorlin) ### Bug fixes @@ -21,6 +23,7 @@ [#1152]: https://github.com/nextstrain/augur/pull/1152 [#1171]: https://github.com/nextstrain/augur/issues/1171 [#1178]: https://github.com/nextstrain/augur/pull/1178 +[#1196]: https://github.com/nextstrain/augur/pull/1196 [`pandas.read_csv()`]: https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.read_csv.html ## 21.1.0 (14 March 2023) From 900e43f4fc98e94f2620da3be1146e62edd424c2 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 12 Apr 2023 10:49:18 -0700 Subject: [PATCH 3/5] filter: Remove example ID columns from help text These examples are already shown in the help text with the default value. --- augur/filter/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index a0366ba00..f99863818 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -18,7 +18,7 @@ def register_arguments(parser): input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format") input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.") input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. 
Increasing this number can speed up filtering at the cost of more memory used.") - input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'") + input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of valid metadata columns containing identifier information.") input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file.") metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata") From 4e960edccf4f53430d68d3f01757f222bc11cc56 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 12 Apr 2023 10:54:23 -0700 Subject: [PATCH 4/5] Clarify single-value inference of some options The previous wording of these option descriptions did not rule out using multiple values at once. This change makes it clear that only one value is inferred. --- augur/curate/__init__.py | 2 +- augur/export_v1.py | 2 +- augur/export_v2.py | 2 +- augur/filter/__init__.py | 4 ++-- augur/frequencies.py | 2 +- augur/io/metadata.py | 3 +++ augur/refine.py | 2 +- augur/traits.py | 2 +- 8 files changed, 11 insertions(+), 8 deletions(-) diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index 3ce793229..257ce2ff5 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -52,7 +52,7 @@ def create_shared_parser(): "Uses the first column of the metadata file if not provided. " "Ignored if also providing a FASTA file input.") shared_inputs.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", - help="Delimiters to accept when reading a metadata file.") + help="Delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") shared_inputs.add_argument("--fasta", help="Plain or gzipped FASTA file. 
Headers can only contain the sequence id used to match a metadata record. " + diff --git a/augur/export_v1.py b/augur/export_v1.py index 670d5e8d6..5b7d7f506 100644 --- a/augur/export_v1.py +++ b/augur/export_v1.py @@ -314,7 +314,7 @@ def add_core_args(parser): core.add_argument('--tree','-t', required=True, help="tree to perform trait reconstruction on") core.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata") core.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", - help="delimiters to accept when reading a metadata file") + help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") core.add_argument('--node-data', required=True, nargs='+', action=ExtendAction, help="JSON files with meta data for each node") core.add_argument('--output-tree', help="JSON file name that is passed on to auspice (e.g., zika_tree.json).") core.add_argument('--output-meta', help="JSON file name that is passed on to auspice (e.g., zika_meta.json).") diff --git a/augur/export_v2.py b/augur/export_v2.py index 5c8d916d7..68398ed56 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -870,7 +870,7 @@ def register_parser(parent_subparsers): ) optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree") optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", - help="delimiters to accept when reading a metadata file") + help="delimiters to accept when reading a metadata file. 
Only one delimiter will be inferred.") optional_inputs.add_argument('--colors', metavar="FILE", help="Custom color definitions, one per line in the format `TRAIT_TYPE\\tTRAIT_VALUE\\tHEX_CODE`") optional_inputs.add_argument('--lat-longs', metavar="TSV", help="Latitudes and longitudes for geography traits (overrides built in mappings)") diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index f99863818..7970f8a02 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -18,8 +18,8 @@ def register_arguments(parser): input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format") input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.") input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.") - input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of valid metadata columns containing identifier information.") - input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file.") + input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of valid metadata columns containing identifier information. Only one ID column will be inferred.") + input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. 
Only one delimiter will be inferred.") metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata") metadata_filter_group.add_argument( diff --git a/augur/frequencies.py b/augur/frequencies.py index 16cea8838..ced5211de 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -23,7 +23,7 @@ def register_parser(parent_subparsers): parser.add_argument('--metadata', type=str, required=True, metavar="FILE", help="metadata including dates for given samples") parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", - help="delimiters to accept when reading a metadata file") + help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") parser.add_argument('--regions', type=str, nargs='+', default=['global'], help="region to subsample to") parser.add_argument("--pivot-interval", type=int, default=3, diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 01564be4e..414224dc9 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -34,8 +34,10 @@ def read_metadata(metadata_file, delimiters, id_columns=VALID_ID_COLUMNS, chunk_ Path to a metadata file to load. delimiters : list of str List of possible delimiters to check for between columns in the metadata. + Only one delimiter will be inferred. id_columns : list of str List of possible id column names to check for, ordered by priority. + Only one id column will be inferred. chunk_size : int Size of chunks to stream from disk with an iterator instead of loading the entire input file into memory. @@ -131,6 +133,7 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER delimiters : list of str List of possible delimiters to check for between columns in the metadata. + Only one delimiter will be inferred. 
duplicate_reporting: DataErrorMethod, optional How should duplicate records be reported diff --git a/augur/refine.py b/augur/refine.py index 2335f826b..30dab6ee4 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -100,7 +100,7 @@ def register_parser(parent_subparsers): parser.add_argument('--tree', '-t', required=True, help="prebuilt Newick") parser.add_argument('--metadata', type=str, metavar="FILE", help="sequence metadata") parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", - help="delimiters to accept when reading a metadata file") + help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") parser.add_argument('--output-tree', type=str, help='file name to write tree to') parser.add_argument('--output-node-data', type=str, help='file name to write branch lengths as node data') parser.add_argument('--use-fft', action="store_true", help="produce timetree using FFT for convolutions") diff --git a/augur/traits.py b/augur/traits.py index 0a423571b..5dfe4dc39 100644 --- a/augur/traits.py +++ b/augur/traits.py @@ -103,7 +103,7 @@ def register_parser(parent_subparsers): parser.add_argument('--tree', '-t', required=True, help="tree to perform trait reconstruction on") parser.add_argument('--metadata', required=True, metavar="FILE", help="table with metadata") parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", - help="delimiters to accept when reading a metadata file") + help="delimiters to accept when reading a metadata file. 
Only one delimiter will be inferred.") parser.add_argument('--weights', required=False, help="tsv/csv table with equilibrium probabilities of discrete states") parser.add_argument('--columns', required=True, nargs='+', help='metadata fields to perform discrete reconstruction on') From c0a9ddb9e132a89909234c003b501fe34111708f Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 12 Apr 2023 11:16:30 -0700 Subject: [PATCH 5/5] filter: Clarify behavior of --metadata-id-columns Copy useful wording from the corresponding parameter description in augur.io.metadata.read_metadata(). --- augur/filter/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index 7970f8a02..24ef547d3 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -18,7 +18,7 @@ def register_arguments(parser): input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format") input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.") input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.") - input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of valid metadata columns containing identifier information. Only one ID column will be inferred.") + input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. 
Only one delimiter will be inferred.") metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")