From 9ca0c35445ea339ff1358817d1049c7f6c0f22e1 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 14 Jun 2023 11:49:20 -0700 Subject: [PATCH 1/3] read_metadata: Pass delimiters argument by key Since it is not a positional argument. --- augur/export_v1.py | 2 +- augur/export_v2.py | 2 +- augur/filter/_run.py | 4 ++-- augur/frequencies.py | 2 +- augur/refine.py | 2 +- augur/traits.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/augur/export_v1.py b/augur/export_v1.py index 93a6c0368..04808f2bd 100644 --- a/augur/export_v1.py +++ b/augur/export_v1.py @@ -368,7 +368,7 @@ def run(args): meta_json = read_config(args.auspice_config) ensure_config_is_v1(meta_json) try: - meta_tsv = read_metadata(args.metadata, args.metadata_delimiters) + meta_tsv = read_metadata(args.metadata, delimiters=args.metadata_delimiters) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " diff --git a/augur/export_v2.py b/augur/export_v2.py index a1fea99ea..dbbb8998d 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -1072,7 +1072,7 @@ def run(args): if args.metadata is not None: try: - metadata_file = read_metadata(args.metadata, args.metadata_delimiters).to_dict(orient="index") + metadata_file = read_metadata(args.metadata, delimiters=args.metadata_delimiters).to_dict(orient="index") for strain in metadata_file.keys(): if "strain" not in metadata_file[strain]: metadata_file[strain]["strain"] = strain diff --git a/augur/filter/_run.py b/augur/filter/_run.py index 03393dded..d89a38f38 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -166,7 +166,7 @@ def run(args): try: metadata_reader = read_metadata( args.metadata, - args.metadata_delimiters, + delimiters=args.metadata_delimiters, id_columns=args.metadata_id_columns, chunk_size=args.metadata_chunk_size, ) @@ -317,7 +317,7 @@ def run(args): # have passed filters. 
metadata_reader = read_metadata( args.metadata, - args.metadata_delimiters, + delimiters=args.metadata_delimiters, id_columns=args.metadata_id_columns, chunk_size=args.metadata_chunk_size, ) diff --git a/augur/frequencies.py b/augur/frequencies.py index ced5211de..f484b6c3a 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -84,7 +84,7 @@ def format_frequencies(freq): def run(args): try: - metadata = read_metadata(args.metadata, args.metadata_delimiters) + metadata = read_metadata(args.metadata, delimiters=args.metadata_delimiters) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " diff --git a/augur/refine.py b/augur/refine.py index 528809657..7d2ac51a0 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -212,7 +212,7 @@ def run(args): print("ERROR: meta data with dates is required for time tree reconstruction", file=sys.stderr) return 1 try: - metadata = read_metadata(args.metadata, args.metadata_delimiters) + metadata = read_metadata(args.metadata, delimiters=args.metadata_delimiters) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " diff --git a/augur/traits.py b/augur/traits.py index 5dfe4dc39..c57d64c11 100644 --- a/augur/traits.py +++ b/augur/traits.py @@ -130,7 +130,7 @@ def run(args): """ tree_fname = args.tree try: - traits = read_metadata(args.metadata, args.metadata_delimiters) + traits = read_metadata(args.metadata, delimiters=args.metadata_delimiters) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " From 42f6cb2c8e23f496e56a2e803bb222b5076c1db3 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 14 Jun 2023 13:24:13 -0700 Subject: [PATCH 2/3] read_metadata: Allow --metadata-id-columns in downstream subcommands Previously, this flag to customize the default value was only available in augur filter. 
Add it to other subcommands to parallel the existing support for --metadata-delimiters which serves a similar purpose. --- augur/export_v1.py | 9 +++++++-- augur/export_v2.py | 9 +++++++-- augur/frequencies.py | 6 ++++-- augur/refine.py | 9 +++++++-- augur/traits.py | 9 +++++++-- 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/augur/export_v1.py b/augur/export_v1.py index 04808f2bd..70de3124b 100644 --- a/augur/export_v1.py +++ b/augur/export_v1.py @@ -11,7 +11,7 @@ from collections import defaultdict from .errors import AugurError from .argparse_ import ExtendAction -from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata +from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors def convert_tree_to_json_structure(node, metadata, div=0, strains=None): @@ -315,6 +315,8 @@ def add_core_args(parser): core.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata") core.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") + core.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", + help="names of possible metadata columns containing identifier information, ordered by priority. 
Only one ID column will be inferred.") core.add_argument('--node-data', required=True, nargs='+', action=ExtendAction, help="JSON files with meta data for each node") core.add_argument('--output-tree', help="JSON file name that is passed on to auspice (e.g., zika_tree.json).") core.add_argument('--output-meta', help="JSON file name that is passed on to auspice (e.g., zika_meta.json).") @@ -368,7 +370,10 @@ def run(args): meta_json = read_config(args.auspice_config) ensure_config_is_v1(meta_json) try: - meta_tsv = read_metadata(args.metadata, delimiters=args.metadata_delimiters) + meta_tsv = read_metadata( + args.metadata, + delimiters=args.metadata_delimiters, + id_columns=args.metadata_id_columns) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " diff --git a/augur/export_v2.py b/augur/export_v2.py index dbbb8998d..1a7c884ad 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -12,7 +12,7 @@ from .argparse_ import ExtendAction from .errors import AugurError -from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata +from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata from .types import ValidationMode from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors from .validate import export_v2 as validate_v2, auspice_config_v2 as validate_auspice_config_v2, ValidateError @@ -852,6 +852,8 @@ def register_parser(parent_subparsers): optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree") optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") + optional_inputs.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", + help="names of possible metadata columns containing identifier information, ordered by priority. 
Only one ID column will be inferred.") optional_inputs.add_argument('--colors', metavar="FILE", help="Custom color definitions, one per line in the format `TRAIT_TYPE\\tTRAIT_VALUE\\tHEX_CODE`") optional_inputs.add_argument('--lat-longs', metavar="TSV", help="Latitudes and longitudes for geography traits (overrides built in mappings)") @@ -1072,7 +1074,10 @@ def run(args): if args.metadata is not None: try: - metadata_file = read_metadata(args.metadata, delimiters=args.metadata_delimiters).to_dict(orient="index") + metadata_file = read_metadata( + args.metadata, + delimiters=args.metadata_delimiters, + id_columns=args.metadata_id_columns).to_dict(orient="index") for strain in metadata_file.keys(): if "strain" not in metadata_file[strain]: metadata_file[strain]["strain"] = strain diff --git a/augur/frequencies.py b/augur/frequencies.py index f484b6c3a..a6b18540d 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -11,7 +11,7 @@ from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT, get_numerical_dates -from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata +from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata from .utils import read_node_data, write_json @@ -24,6 +24,8 @@ def register_parser(parent_subparsers): help="metadata including dates for given samples") parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") + parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", + help="names of possible metadata columns containing identifier information, ordered by priority. 
Only one ID column will be inferred.") parser.add_argument('--regions', type=str, nargs='+', default=['global'], help="region to subsample to") parser.add_argument("--pivot-interval", type=int, default=3, @@ -84,7 +86,7 @@ def format_frequencies(freq): def run(args): try: - metadata = read_metadata(args.metadata, delimiters=args.metadata_delimiters) + metadata = read_metadata(args.metadata, delimiters=args.metadata_delimiters, id_columns=args.metadata_id_columns) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " diff --git a/augur/refine.py b/augur/refine.py index 7d2ac51a0..343dcee59 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -6,7 +6,7 @@ from Bio import Phylo from .dates import get_numerical_dates from .dates.errors import InvalidYearBounds -from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata +from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata from .utils import read_tree, write_json, InvalidTreeError from .errors import AugurError from treetime.vcf_utils import read_vcf @@ -102,6 +102,8 @@ def register_parser(parent_subparsers): parser.add_argument('--metadata', type=str, metavar="FILE", help="sequence metadata") parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") + parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", + help="names of possible metadata columns containing identifier information, ordered by priority. 
Only one ID column will be inferred.") parser.add_argument('--output-tree', type=str, help='file name to write tree to') parser.add_argument('--output-node-data', type=str, help='file name to write branch lengths as node data') parser.add_argument('--use-fft', action="store_true", help="produce timetree using FFT for convolutions") @@ -212,7 +214,10 @@ def run(args): print("ERROR: meta data with dates is required for time tree reconstruction", file=sys.stderr) return 1 try: - metadata = read_metadata(args.metadata, delimiters=args.metadata_delimiters) + metadata = read_metadata( + args.metadata, + delimiters=args.metadata_delimiters, + id_columns=args.metadata_id_columns) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " diff --git a/augur/traits.py b/augur/traits.py index c57d64c11..6695de1c5 100644 --- a/augur/traits.py +++ b/augur/traits.py @@ -7,7 +7,7 @@ import os, sys import pandas as pd from .errors import AugurError -from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata +from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata from .utils import write_json, get_json_name TINY = 1e-12 @@ -104,6 +104,8 @@ def register_parser(parent_subparsers): parser.add_argument('--metadata', required=True, metavar="FILE", help="table with metadata") parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") + parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", + help="names of possible metadata columns containing identifier information, ordered by priority. 
Only one ID column will be inferred.") parser.add_argument('--weights', required=False, help="tsv/csv table with equilibrium probabilities of discrete states") parser.add_argument('--columns', required=True, nargs='+', help='metadata fields to perform discrete reconstruction on') @@ -130,7 +132,10 @@ def run(args): """ tree_fname = args.tree try: - traits = read_metadata(args.metadata, delimiters=args.metadata_delimiters) + traits = read_metadata( + args.metadata, + delimiters=args.metadata_delimiters, + id_columns=args.metadata_id_columns) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " From 0b64f5e442703143a6edc52143fcae6ef4d5cae9 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 14 Jun 2023 13:36:00 -0700 Subject: [PATCH 3/3] Update changelog --- CHANGES.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 045359ae0..47449488b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,11 @@ ## __NEXT__ +### Features + +* export, frequencies, refine, traits: Add a new flag `--metadata-id-columns` to customize the possible metadata ID columns. Previously, this was only available in `augur filter`. [#1240][] (@victorlin) + +[#1240]: https://github.com/nextstrain/augur/pull/1240 ## 22.0.2 (26 May 2023)