Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow customization of input metadata delimiter #1196

Merged
merged 5 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

### Major Changes

* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this supported any arbitrary delimiters for the metadata. It is now restricted to CSV and TSV, which are the officially supported formats for all Augur subcommands that use this function. [#812][] (@victorlin)
* export, filter, frequencies, refine, traits: From versions 10.0.0 through 21.1.0, arbitrary delimiters for `--metadata` were supported because the internal implementation was more permissive than the advertised CSV and TSV support. Starting with this version, non-CSV/TSV files are no longer supported by default. To adjust for this breaking change, specify custom delimiters with the new `--metadata-delimiters` flag. [#1196][] (@victorlin)
* `augur.io.read_metadata`: Previously, this supported arbitrary delimiters for the metadata. It now requires a new argument, `valid_delimiters`. [#812][] (@victorlin)

### Features

* Constrain `bcbio-gff` to >=0.7.0 and allow `Biopython` >=1.81 again. We had to introduce the `Biopython` constraint in v21.0.1 (see [#1152][]) due to `bcbio-gff` <0.7.0 relying on the removed `Biopython` feature `UnknownSeq`. [#1178][] (@corneliusroemer)
* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this used the Python parser engine for [`pandas.read_csv()`][]. Updated to use the C engine for faster reading of metadata. [#812][] (@victorlin)
* curate: Allow custom metadata delimiters with the new `--metadata-delimiters` flag. [#1196][] (@victorlin)

### Bug fixes

Expand All @@ -21,6 +23,7 @@
[#1152]: https://github.com/nextstrain/augur/pull/1152
[#1171]: https://github.com/nextstrain/augur/issues/1171
[#1178]: https://github.com/nextstrain/augur/pull/1178
[#1196]: https://github.com/nextstrain/augur/pull/1196
[`pandas.read_csv()`]: https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.read_csv.html

## 21.1.0 (14 March 2023)
Expand Down
37 changes: 27 additions & 10 deletions augur/curate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from augur.argparse_ import add_command_subparsers
from augur.errors import AugurError
from augur.io.json import dump_ndjson, load_ndjson
from augur.io.metadata import read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
from augur.io.sequences import write_records_to_fasta
from augur.types import DataErrorMethod
from . import normalize_strings, passthru
Expand Down Expand Up @@ -46,11 +46,13 @@ def create_shared_parser():
If no input options are provided, commands will try to read NDJSON records from stdin.
""")
shared_inputs.add_argument("--metadata",
help="Input metadata file, as CSV or TSV. Accepts '-' to read metadata from stdin.")
help="Input metadata file. Accepts '-' to read metadata from stdin.")
shared_inputs.add_argument("--id-column",
help="Name of the metadata column that contains the record identifier for reporting duplicate records. "
"Uses the first column of the metadata file if not provided. "
"Ignored if also providing a FASTA file input.")
shared_inputs.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+",
help="Delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")

shared_inputs.add_argument("--fasta",
help="Plain or gzipped FASTA file. Headers can only contain the sequence id used to match a metadata record. " +
Expand Down Expand Up @@ -133,15 +135,30 @@ def run(args):
args.metadata = sys.stdin

if args.metadata and args.fasta:
records = read_metadata_with_sequences(
args.metadata,
args.fasta,
args.seq_id_column,
args.seq_field,
DataErrorMethod(args.unmatched_reporting),
DataErrorMethod(args.duplicate_reporting))
try:
records = read_metadata_with_sequences(
args.metadata,
args.metadata_delimiters,
args.fasta,
args.seq_id_column,
args.seq_field,
DataErrorMethod(args.unmatched_reporting),
DataErrorMethod(args.duplicate_reporting))
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
f"Valid delimiters are: {args.metadata_delimiters!r}. "
"This can be changed with --metadata-delimiters."
)
elif args.metadata:
records = read_table_to_dict(args.metadata, DataErrorMethod(args.duplicate_reporting), args.id_column)
try:
records = read_table_to_dict(args.metadata, args.metadata_delimiters, DataErrorMethod(args.duplicate_reporting), args.id_column)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
f"Valid delimiters are: {args.metadata_delimiters!r}. "
"This can be changed with --metadata-delimiters."
)
elif not sys.stdin.isatty():
records = load_ndjson(sys.stdin)
else:
Expand Down
16 changes: 13 additions & 3 deletions augur/export_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
from Bio import Phylo
from argparse import SUPPRESS
from collections import defaultdict
from .errors import AugurError
from .argparse_ import ExtendAction
from .io.metadata import read_metadata
from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata
from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors

def convert_tree_to_json_structure(node, metadata, div=0, strains=None):
Expand Down Expand Up @@ -311,7 +312,9 @@ def get_root_sequence(root_node, ref=None, translations=None):
def add_core_args(parser):
core = parser.add_argument_group("REQUIRED")
core.add_argument('--tree','-t', required=True, help="tree to perform trait reconstruction on")
core.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV")
core.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata")
core.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
core.add_argument('--node-data', required=True, nargs='+', action=ExtendAction, help="JSON files with meta data for each node")
core.add_argument('--output-tree', help="JSON file name that is passed on to auspice (e.g., zika_tree.json).")
core.add_argument('--output-meta', help="JSON file name that is passed on to auspice (e.g., zika_meta.json).")
Expand Down Expand Up @@ -364,7 +367,14 @@ def run(args):

meta_json = read_config(args.auspice_config)
ensure_config_is_v1(meta_json)
meta_tsv = read_metadata(args.metadata)
try:
meta_tsv = read_metadata(args.metadata, args.metadata_delimiters)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
f"Valid delimiters are: {args.metadata_delimiters!r}. "
"This can be changed with --metadata-delimiters."
)
add_tsv_metadata_to_nodes(nodes, meta_tsv, meta_json)

tree_layout(T)
Expand Down
15 changes: 12 additions & 3 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from Bio import Phylo

from .argparse_ import ExtendAction
from .io.metadata import read_metadata
from .errors import AugurError
from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata
from .types import ValidationMode
from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors
from .validate import export_v2 as validate_v2, auspice_config_v2 as validate_auspice_config_v2, ValidateError
Expand Down Expand Up @@ -867,7 +868,9 @@ def register_parser(parent_subparsers):
optional_inputs = parser.add_argument_group(
title="OPTIONAL INPUT FILES"
)
optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree, as CSV or TSV")
optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree")
optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
optional_inputs.add_argument('--colors', metavar="FILE", help="Custom color definitions, one per line in the format `TRAIT_TYPE\\tTRAIT_VALUE\\tHEX_CODE`")
optional_inputs.add_argument('--lat-longs', metavar="TSV", help="Latitudes and longitudes for geography traits (overrides built in mappings)")

Expand Down Expand Up @@ -1039,13 +1042,19 @@ def run(args):

if args.metadata is not None:
try:
metadata_file = read_metadata(args.metadata).to_dict(orient="index")
metadata_file = read_metadata(args.metadata, args.metadata_delimiters).to_dict(orient="index")
for strain in metadata_file.keys():
if "strain" not in metadata_file[strain]:
metadata_file[strain]["strain"] = strain
except FileNotFoundError:
print(f"ERROR: meta data file ({args.metadata}) does not exist", file=sys.stderr)
sys.exit(2)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
f"Valid delimiters are: {args.metadata_delimiters!r}. "
"This can be changed with --metadata-delimiters."
)
except Exception as error:
print(f"ERROR: {error}", file=sys.stderr)
sys.exit(1)
Expand Down
7 changes: 4 additions & 3 deletions augur/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Filter and subsample a sequence set.
"""
from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
from augur.io.metadata import VALID_ID_COLUMNS
from augur.io.metadata import DEFAULT_DELIMITERS, VALID_ID_COLUMNS
from augur.types import EmptyOutputReportingMethod
from . import constants

Expand All @@ -14,11 +14,12 @@ def register_arguments(parser):
unit tests that use this function to create argparser.
"""
input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered")
input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata, as CSV or TSV")
input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata")
input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.")
input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'")
input_group.add_argument('--metadata-id-columns', default=VALID_ID_COLUMNS, nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")

metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
metadata_filter_group.add_argument(
Expand Down
21 changes: 15 additions & 6 deletions augur/filter/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
DELIMITER as SEQUENCE_INDEX_DELIMITER,
)
from augur.io.file import open_file
from augur.io.metadata import read_metadata
from augur.io.metadata import InvalidDelimiter, read_metadata
from augur.io.sequences import read_sequences, write_sequences
from augur.io.print import print_err
from augur.io.vcf import is_vcf as filename_is_vcf, write_vcf
Expand Down Expand Up @@ -163,11 +163,19 @@ def run(args):
all_sequences_to_include = set()
filter_counts = defaultdict(int)

metadata_reader = read_metadata(
args.metadata,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
)
try:
metadata_reader = read_metadata(
args.metadata,
args.metadata_delimiters,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
f"Valid delimiters are: {args.metadata_delimiters!r}. "
"This can be changed with --metadata-delimiters."
)
for metadata in metadata_reader:
duplicate_strains = (
set(metadata.index[metadata.index.duplicated()]) |
Expand Down Expand Up @@ -309,6 +317,7 @@ def run(args):
# have passed filters.
metadata_reader = read_metadata(
args.metadata,
args.metadata_delimiters,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
)
Expand Down
16 changes: 13 additions & 3 deletions augur/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from Bio import Phylo, AlignIO
from Bio.Align import MultipleSeqAlignment

from .errors import AugurError
from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies
from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError
from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT, get_numerical_dates
from .io.metadata import read_metadata
from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata
from .utils import read_node_data, write_json


Expand All @@ -20,7 +21,9 @@ def register_parser(parent_subparsers):
parser.add_argument('--method', choices=["diffusion", "kde"], required=True,
help="method by which frequencies should be estimated")
parser.add_argument('--metadata', type=str, required=True, metavar="FILE",
help="metadata including dates for given samples, as CSV or TSV")
help="metadata including dates for given samples")
parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
parser.add_argument('--regions', type=str, nargs='+', default=['global'],
help="region to subsample to")
parser.add_argument("--pivot-interval", type=int, default=3,
Expand Down Expand Up @@ -80,7 +83,14 @@ def format_frequencies(freq):


def run(args):
metadata = read_metadata(args.metadata)
try:
metadata = read_metadata(args.metadata, args.metadata_delimiters)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
f"Valid delimiters are: {args.metadata_delimiters!r}. "
"This can be changed with --metadata-delimiters."
)
dates = get_numerical_dates(metadata, fmt='%Y-%m-%d')
stiffness = args.stiffness
inertia = args.inertia
Expand Down
Loading