From c0b13b40f02d3c7377bae30356e0a790667ad5df Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 29 Mar 2023 14:47:13 -0700 Subject: [PATCH 1/5] filter: Add test to show existing support for metadata delimiters --- .../filter/cram/filter-metadata-delimiter.t | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 tests/functional/filter/cram/filter-metadata-delimiter.t diff --git a/tests/functional/filter/cram/filter-metadata-delimiter.t b/tests/functional/filter/cram/filter-metadata-delimiter.t new file mode 100644 index 000000000..c7443ee9f --- /dev/null +++ b/tests/functional/filter/cram/filter-metadata-delimiter.t @@ -0,0 +1,35 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Comma-delimited metadata is allowed. However, the output metadata will be tab-delimited. + + $ cat >metadata.txt <<~~ + > strain,column + > SEQ_1,A + > SEQ_2,B + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.txt \ + > --exclude-where column=A \ + > --output-metadata filtered.txt > /dev/null + $ cat filtered.txt + strain\tcolumn (esc) + SEQ_2\tB (esc) + +Colon-delimited metadata is allowed. However, the output metadata will be tab-delimited. + + $ cat >metadata.txt <<~~ + > strain:column + > SEQ_1:A + > SEQ_2:B + > ~~ + + $ ${AUGUR} filter \ + > --metadata metadata.txt \ + > --exclude-where column=A \ + > --output-metadata filtered.txt > /dev/null + $ cat filtered.txt + strain\tcolumn (esc) + SEQ_2\tB (esc) From 74e7fe91949c546e0878b26013764c38c50f6a72 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 29 Mar 2023 14:29:22 -0700 Subject: [PATCH 2/5] read_metadata: Restrict possible delimiters when reading Previously, the delimiter could be anything arbitrary. However, all Augur subcommands that use this function only advertise compatibility with CSV and TSV. I don't think there's a good reason to support arbitrary delimiters. --- augur/io/metadata.py | 17 ++++++++++++++++- .../filter/cram/filter-metadata-delimiter.t | 7 +++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 79feb723e..c41981d50 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -60,7 +60,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None) """ kwargs = { - "sep": None, + "sep": _get_delimiter(metadata_file), "engine": "python", "skipinitialspace": True, "na_filter": False, @@ -149,6 +149,8 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i handle = chain(table_sample_file, handle) try: + # Note: this sort of duplicates _get_delimiter(), but it's easier if + # this is separate since it handles non-seekable buffers. dialect = csv.Sniffer().sniff(table_sample, valid_delimiters) except csv.Error as err: raise AugurError( @@ -426,3 +428,16 @@ def write_records_to_tsv(records, output_file): for record in records: tsv_writer.writerow(record) + + +def _get_delimiter(path: str): + """Get the delimiter of a file.""" + with open_file(path) as file: + try: + # Infer the delimiter from the first line. + return csv.Sniffer().sniff(file.readline(), delimiters=[',', '\t']).delimiter + except csv.Error as err: + raise AugurError( + f"Could not determine the delimiter of {path!r}. " + "File must be a CSV or TSV." + ) from err diff --git a/tests/functional/filter/cram/filter-metadata-delimiter.t b/tests/functional/filter/cram/filter-metadata-delimiter.t index c7443ee9f..ffce8207b 100644 --- a/tests/functional/filter/cram/filter-metadata-delimiter.t +++ b/tests/functional/filter/cram/filter-metadata-delimiter.t @@ -18,7 +18,7 @@ Comma-delimited metadata is allowed. However, the output metadata will be tab-de strain\tcolumn (esc) SEQ_2\tB (esc) -Colon-delimited metadata is allowed. However, the output metadata will be tab-delimited. +Colon-delimited metadata is not allowed. $ cat >metadata.txt <<~~ > strain:column @@ -30,6 +30,5 @@ Colon-delimited metadata is allowed. However, the output metadata will be tab-de > --metadata metadata.txt \ > --exclude-where column=A \ > --output-metadata filtered.txt > /dev/null - $ cat filtered.txt - strain\tcolumn (esc) - SEQ_2\tB (esc) + ERROR: Could not determine the delimiter of 'metadata.txt'. File must be a CSV or TSV. + [2] From a90f0a5a1a8cd0f180bb9e988cd14180a60253f0 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 29 Mar 2023 15:11:52 -0700 Subject: [PATCH 3/5] read_metadata: Use the C engine for pandas.read_csv() The python engine was only used to detect the delimiter. Now that the delimiter is detected separately, use the C engine since it is faster. --- augur/io/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index c41981d50..86a2ce217 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -61,7 +61,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None) """ kwargs = { "sep": _get_delimiter(metadata_file), - "engine": "python", + "engine": "c", "skipinitialspace": True, "na_filter": False, } From 04489f704eb653167952529f7faaf7dba7ff32be Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 29 Mar 2023 15:52:33 -0700 Subject: [PATCH 4/5] Update changelog --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index bb53b76f3..79f52910e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,19 +2,26 @@ ## __NEXT__ +### Major Changes + +* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this supported any arbitrary delimiters for the metadata. It is now restricted to CSV and TSV, which are the officially supported formats for all Augur subcommands that use this function. [#812][] (@victorlin) + ### Features * Constrain `bcbio-gff` to >=0.7.0 and allow `Biopython` >=1.81 again. We had to introduce the `Biopython` constraint in v21.0.1 (see [#1152][]) due to `bcbio-gff` <0.7.0 relying on the removed `Biopython` feature `UnknownSeq`. [#1178][] (@corneliusroemer) +* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this used the Python parser engine for [`pandas.read_csv()`][]. Updated to use the C engine for faster reading of metadata. [#812][] (@victorlin) ### Bug fixes * filter, frequencies, refine, parse: Previously, ambiguous dates in the future had a limit of today's date imposed on the upper value but not the lower value. It is now imposed on the lower value as well. [#1171][] (@victorlin) * refine: `--year-bounds` was ignored in versions 9.0.0 through 20.0.0. It now works. [#1136][] (@victorlin) +[#812]: https://github.com/nextstrain/augur/pull/812 [#1136]: https://github.com/nextstrain/augur/issues/1136 [#1152]: https://github.com/nextstrain/augur/pull/1152 [#1171]: https://github.com/nextstrain/augur/issues/1171 [#1178]: https://github.com/nextstrain/augur/pull/1178 +[`pandas.read_csv()`]: https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.read_csv.html ## 21.1.0 (14 March 2023) From 9f48ff2fd39c93456e75b08f7bac8e6d91b0c0cb Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 31 Mar 2023 10:05:53 -0700 Subject: [PATCH 5/5] Use constant for valid delimiters Avoids re-defining this list at each use case and prevents them from getting out of sync. --- augur/io/metadata.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 86a2ce217..5dea0eabf 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -12,6 +12,10 @@ from .file import open_file +# List of valid delimiters when reading a metadata file. +VALID_DELIMITERS = (',', '\t') + + def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None): """Read metadata from a given filename and into a pandas `DataFrame` or `TextFileReader` object. @@ -135,7 +139,6 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i 2. The provided *id_column* does not exist in the *metadata* 3. The *duplicate_reporting* method is set to ERROR_FIRST or ERROR_ALL and duplicate(s) are found """ - valid_delimiters = [',', '\t'] seen_ids = set() duplicate_ids = set() with open_file(table) as handle: @@ -151,7 +154,7 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i try: # Note: this sort of duplicates _get_delimiter(), but it's easier if # this is separate since it handles non-seekable buffers. - dialect = csv.Sniffer().sniff(table_sample, valid_delimiters) + dialect = csv.Sniffer().sniff(table_sample, VALID_DELIMITERS) except csv.Error as err: raise AugurError( f"Could not determine the delimiter of {table!r}. " @@ -435,7 +438,7 @@ def _get_delimiter(path: str): with open_file(path) as file: try: # Infer the delimiter from the first line. - return csv.Sniffer().sniff(file.readline(), delimiters=[',', '\t']).delimiter + return csv.Sniffer().sniff(file.readline(), VALID_DELIMITERS).delimiter except csv.Error as err: raise AugurError( f"Could not determine the delimiter of {path!r}. "