From c0b13b40f02d3c7377bae30356e0a790667ad5df Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Wed, 29 Mar 2023 14:47:13 -0700
Subject: [PATCH 1/5] filter: Add test to show existing support for metadata
 delimiters

---
 .../filter/cram/filter-metadata-delimiter.t   | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 tests/functional/filter/cram/filter-metadata-delimiter.t

diff --git a/tests/functional/filter/cram/filter-metadata-delimiter.t b/tests/functional/filter/cram/filter-metadata-delimiter.t
new file mode 100644
index 000000000..c7443ee9f
--- /dev/null
+++ b/tests/functional/filter/cram/filter-metadata-delimiter.t
@@ -0,0 +1,35 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Comma-delimited metadata is allowed. However, the output metadata will be tab-delimited.
+
+  $ cat >metadata.txt <<~~
+  > strain,column
+  > SEQ_1,A
+  > SEQ_2,B
+  > ~~
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.txt \
+  >  --exclude-where column=A \
+  >  --output-metadata filtered.txt > /dev/null
+  $ cat filtered.txt
+  strain\tcolumn (esc)
+  SEQ_2\tB (esc)
+
+Colon-delimited metadata is allowed. However, the output metadata will be tab-delimited.
+
+  $ cat >metadata.txt <<~~
+  > strain:column
+  > SEQ_1:A
+  > SEQ_2:B
+  > ~~
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.txt \
+  >  --exclude-where column=A \
+  >  --output-metadata filtered.txt > /dev/null
+  $ cat filtered.txt
+  strain\tcolumn (esc)
+  SEQ_2\tB (esc)

From 74e7fe91949c546e0878b26013764c38c50f6a72 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Wed, 29 Mar 2023 14:29:22 -0700
Subject: [PATCH 2/5] read_metadata: Restrict possible delimiters when reading

Previously, the delimiter could be anything arbitrary. However, all
Augur subcommands that use this function only advertise compatibility
with CSV and TSV. I don't think there's a good reason to support
arbitrary delimiters.
---
 augur/io/metadata.py                            | 17 ++++++++++++++++-
 .../filter/cram/filter-metadata-delimiter.t     |  7 +++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/augur/io/metadata.py b/augur/io/metadata.py
index 79feb723e..c41981d50 100644
--- a/augur/io/metadata.py
+++ b/augur/io/metadata.py
@@ -60,7 +60,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None)
 
     """
     kwargs = {
-        "sep": None,
+        "sep": _get_delimiter(metadata_file),
         "engine": "python",
         "skipinitialspace": True,
         "na_filter": False,
@@ -149,6 +149,8 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i
             handle = chain(table_sample_file, handle)
 
         try:
+            # Note: this sort of duplicates _get_delimiter(), but it's easier if
+            # this is separate since it handles non-seekable buffers.
             dialect = csv.Sniffer().sniff(table_sample, valid_delimiters)
         except csv.Error as err:
             raise AugurError(
@@ -426,3 +428,16 @@ def write_records_to_tsv(records, output_file):
 
         for record in records:
             tsv_writer.writerow(record)
+
+
+def _get_delimiter(path: str):
+    """Get the delimiter of a file."""
+    with open_file(path) as file:
+        try:
+            # Infer the delimiter from the first line.
+            return csv.Sniffer().sniff(file.readline(), delimiters=[',', '\t']).delimiter
+        except csv.Error as err:
+            raise AugurError(
+                f"Could not determine the delimiter of {path!r}. "
+                "File must be a CSV or TSV."
+            ) from err
diff --git a/tests/functional/filter/cram/filter-metadata-delimiter.t b/tests/functional/filter/cram/filter-metadata-delimiter.t
index c7443ee9f..ffce8207b 100644
--- a/tests/functional/filter/cram/filter-metadata-delimiter.t
+++ b/tests/functional/filter/cram/filter-metadata-delimiter.t
@@ -18,7 +18,7 @@ Comma-delimited metadata is allowed. However, the output metadata will be tab-de
   strain\tcolumn (esc)
   SEQ_2\tB (esc)
 
-Colon-delimited metadata is allowed. However, the output metadata will be tab-delimited.
+Colon-delimited metadata is not allowed.
 
   $ cat >metadata.txt <<~~
   > strain:column
@@ -30,6 +30,5 @@ Colon-delimited metadata is allowed. However, the output metadata will be tab-de
   >  --metadata metadata.txt \
   >  --exclude-where column=A \
   >  --output-metadata filtered.txt > /dev/null
-  $ cat filtered.txt
-  strain\tcolumn (esc)
-  SEQ_2\tB (esc)
+  ERROR: Could not determine the delimiter of 'metadata.txt'. File must be a CSV or TSV.
+  [2]

From a90f0a5a1a8cd0f180bb9e988cd14180a60253f0 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Wed, 29 Mar 2023 15:11:52 -0700
Subject: [PATCH 3/5] read_metadata: Use the C engine for pandas.read_csv()

The python engine was only used to detect the delimiter. Now that the
delimiter is detected separately, use the C engine since it is faster.
---
 augur/io/metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/augur/io/metadata.py b/augur/io/metadata.py
index c41981d50..86a2ce217 100644
--- a/augur/io/metadata.py
+++ b/augur/io/metadata.py
@@ -61,7 +61,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None)
     """
     kwargs = {
         "sep": _get_delimiter(metadata_file),
-        "engine": "python",
+        "engine": "c",
         "skipinitialspace": True,
         "na_filter": False,
     }

From 04489f704eb653167952529f7faaf7dba7ff32be Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Wed, 29 Mar 2023 15:52:33 -0700
Subject: [PATCH 4/5] Update changelog

---
 CHANGES.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index bb53b76f3..79f52910e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,19 +2,26 @@
 
 ## __NEXT__
 
+### Major Changes
+
+* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this supported any arbitrary delimiters for the metadata. It is now restricted to CSV and TSV, which are the officially supported formats for all Augur subcommands that use this function. [#812][] (@victorlin)
+
 ### Features
 
 * Constrain `bcbio-gff` to >=0.7.0 and allow `Biopython` >=1.81 again. We had to introduce the `Biopython` constraint in v21.0.1 (see [#1152][]) due to `bcbio-gff` <0.7.0 relying on the removed `Biopython` feature `UnknownSeq`. [#1178][] (@corneliusroemer)
+* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this used the Python parser engine for [`pandas.read_csv()`][]. Updated to use the C engine for faster reading of metadata. [#812][] (@victorlin)
 
 ### Bug fixes
 
 * filter, frequencies, refine, parse: Previously, ambiguous dates in the future had a limit of today's date imposed on the upper value but not the lower value. It is now imposed on the lower value as well. [#1171][] (@victorlin)
 * refine: `--year-bounds` was ignored in versions 9.0.0 through 20.0.0. It now works. [#1136][] (@victorlin)
 
+[#812]: https://github.com/nextstrain/augur/pull/812
 [#1136]: https://github.com/nextstrain/augur/issues/1136
 [#1152]: https://github.com/nextstrain/augur/pull/1152
 [#1171]: https://github.com/nextstrain/augur/issues/1171
 [#1178]: https://github.com/nextstrain/augur/pull/1178
+[`pandas.read_csv()`]: https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.read_csv.html
 
 ## 21.1.0 (14 March 2023)
 

From 9f48ff2fd39c93456e75b08f7bac8e6d91b0c0cb Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Fri, 31 Mar 2023 10:05:53 -0700
Subject: [PATCH 5/5] Use constant for valid delimiters

Avoids re-defining this list at each use case and prevents them from
getting out of sync.
---
 augur/io/metadata.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/augur/io/metadata.py b/augur/io/metadata.py
index 86a2ce217..5dea0eabf 100644
--- a/augur/io/metadata.py
+++ b/augur/io/metadata.py
@@ -12,6 +12,10 @@
 from .file import open_file
 
 
+# List of valid delimiters when reading a metadata file.
+VALID_DELIMITERS = (',', '\t')
+
+
 def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None):
     """Read metadata from a given filename and into a pandas `DataFrame` or
     `TextFileReader` object.
@@ -135,7 +139,6 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i
         2. The provided *id_column* does not exist in the *metadata*
         3. The *duplicate_reporting* method is set to ERROR_FIRST or ERROR_ALL and duplicate(s) are found
     """
-    valid_delimiters = [',', '\t']
     seen_ids = set()
     duplicate_ids = set()
     with open_file(table) as handle:
@@ -151,7 +154,7 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i
         try:
             # Note: this sort of duplicates _get_delimiter(), but it's easier if
             # this is separate since it handles non-seekable buffers.
-            dialect = csv.Sniffer().sniff(table_sample, valid_delimiters)
+            dialect = csv.Sniffer().sniff(table_sample, VALID_DELIMITERS)
         except csv.Error as err:
             raise AugurError(
                 f"Could not determine the delimiter of {table!r}. "
@@ -435,7 +438,7 @@ def _get_delimiter(path: str):
     with open_file(path) as file:
         try:
             # Infer the delimiter from the first line.
-            return csv.Sniffer().sniff(file.readline(), delimiters=[',', '\t']).delimiter
+            return csv.Sniffer().sniff(file.readline(), VALID_DELIMITERS).delimiter
         except csv.Error as err:
             raise AugurError(
                 f"Could not determine the delimiter of {path!r}. "