diff --git a/CHANGES.md b/CHANGES.md
index bb53b76f3..79f52910e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,19 +2,26 @@
 
 ## __NEXT__
 
+### Major Changes
+
+* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this supported any arbitrary delimiters for the metadata. It is now restricted to CSV and TSV, which are the officially supported formats for all Augur subcommands that use this function. [#812][] (@victorlin)
+
 ### Features
 
 * Constrain `bcbio-gff` to >=0.7.0 and allow `Biopython` >=1.81 again. We had to introduce the `Biopython` constraint in v21.0.1 (see [#1152][]) due to `bcbio-gff` <0.7.0 relying on the removed `Biopython` feature `UnknownSeq`. [#1178][] (@corneliusroemer)
+* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this used the Python parser engine for [`pandas.read_csv()`][]. Updated to use the C engine for faster reading of metadata. [#812][] (@victorlin)
 
 ### Bug fixes
 
 * filter, frequencies, refine, parse: Previously, ambiguous dates in the future had a limit of today's date imposed on the upper value but not the lower value. It is now imposed on the lower value as well. [#1171][] (@victorlin)
 * refine: `--year-bounds` was ignored in versions 9.0.0 through 20.0.0. It now works. [#1136][] (@victorlin)
 
+[#812]: https://github.com/nextstrain/augur/pull/812
 [#1136]: https://github.com/nextstrain/augur/issues/1136
 [#1152]: https://github.com/nextstrain/augur/pull/1152
 [#1171]: https://github.com/nextstrain/augur/issues/1171
 [#1178]: https://github.com/nextstrain/augur/pull/1178
+[`pandas.read_csv()`]: https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.read_csv.html
 
 ## 21.1.0 (14 March 2023)
 
diff --git a/augur/io/metadata.py b/augur/io/metadata.py
index 79feb723e..5dea0eabf 100644
--- a/augur/io/metadata.py
+++ b/augur/io/metadata.py
@@ -12,6 +12,10 @@
 from .file import open_file
 
 
+# List of valid delimiters when reading a metadata file.
+VALID_DELIMITERS = (',', '\t')
+
+
 def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None):
     """Read metadata from a given filename and into a pandas `DataFrame` or
     `TextFileReader` object.
@@ -60,8 +64,8 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None)
 
     """
     kwargs = {
-        "sep": None,
-        "engine": "python",
+        "sep": _get_delimiter(metadata_file),
+        "engine": "c",
         "skipinitialspace": True,
         "na_filter": False,
     }
@@ -135,7 +139,6 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i
         2. The provided *id_column* does not exist in the *metadata*
         3. The *duplicate_reporting* method is set to ERROR_FIRST or ERROR_ALL and duplicate(s) are found
     """
-    valid_delimiters = [',', '\t']
     seen_ids = set()
     duplicate_ids = set()
     with open_file(table) as handle:
@@ -149,7 +152,9 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i
             handle = chain(table_sample_file, handle)
 
         try:
-            dialect = csv.Sniffer().sniff(table_sample, valid_delimiters)
+            # Note: this sort of duplicates _get_delimiter(), but it's easier if
+            # this is separate since it handles non-seekable buffers.
+            dialect = csv.Sniffer().sniff(table_sample, VALID_DELIMITERS)
         except csv.Error as err:
             raise AugurError(
                 f"Could not determine the delimiter of {table!r}. "
@@ -426,3 +431,16 @@ def write_records_to_tsv(records, output_file):
 
         for record in records:
             tsv_writer.writerow(record)
+
+
+def _get_delimiter(path: str):
+    """Get the delimiter of a file."""
+    with open_file(path) as file:
+        try:
+            # Infer the delimiter from the first line.
+            return csv.Sniffer().sniff(file.readline(), VALID_DELIMITERS).delimiter
+        except csv.Error as err:
+            raise AugurError(
+                f"Could not determine the delimiter of {path!r}. "
+                "File must be a CSV or TSV."
+            ) from err
diff --git a/tests/functional/filter/cram/filter-metadata-delimiter.t b/tests/functional/filter/cram/filter-metadata-delimiter.t
new file mode 100644
index 000000000..ffce8207b
--- /dev/null
+++ b/tests/functional/filter/cram/filter-metadata-delimiter.t
@@ -0,0 +1,34 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Comma-delimited metadata is allowed. However, the output metadata will be tab-delimited.
+
+  $ cat >metadata.txt <<~~
+  > strain,column
+  > SEQ_1,A
+  > SEQ_2,B
+  > ~~
+
+  $ ${AUGUR} filter \
+  > --metadata metadata.txt \
+  > --exclude-where column=A \
+  > --output-metadata filtered.txt > /dev/null
+  $ cat filtered.txt
+  strain\tcolumn (esc)
+  SEQ_2\tB (esc)
+
+Colon-delimited metadata is not allowed.
+
+  $ cat >metadata.txt <<~~
+  > strain:column
+  > SEQ_1:A
+  > SEQ_2:B
+  > ~~
+
+  $ ${AUGUR} filter \
+  > --metadata metadata.txt \
+  > --exclude-where column=A \
+  > --output-metadata filtered.txt > /dev/null
+  ERROR: Could not determine the delimiter of 'metadata.txt'. File must be a CSV or TSV.
+  [2]