Merge pull request #812: io: Parse metadata with C engine, restrict to either CSV or TSV
victorlin authored Mar 31, 2023
2 parents 1dacac1 + 9f48ff2 commit 73aad80
Showing 3 changed files with 63 additions and 4 deletions.
7 changes: 7 additions & 0 deletions CHANGES.md
@@ -2,19 +2,26 @@

## __NEXT__

### Major Changes

* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this supported arbitrary delimiters in the metadata file. It is now restricted to CSV and TSV, which are the officially supported formats for all Augur subcommands that use this function. [#812][] (@victorlin)

### Features

* Constrain `bcbio-gff` to >=0.7.0 and allow `Biopython` >=1.81 again. We had to introduce the `Biopython` constraint in v21.0.1 (see [#1152][]) due to `bcbio-gff` <0.7.0 relying on the removed `Biopython` feature `UnknownSeq`. [#1178][] (@corneliusroemer)
* `augur.io.read_metadata` (used by export, filter, frequencies, refine, and traits): Previously, this used the Python parser engine for [`pandas.read_csv()`][]. Updated to use the C engine for faster reading of metadata. [#812][] (@victorlin)

### Bug fixes

* filter, frequencies, refine, parse: Previously, ambiguous dates in the future had a limit of today's date imposed on the upper value but not the lower value. It is now imposed on the lower value as well. [#1171][] (@victorlin)
* refine: `--year-bounds` was ignored in versions 9.0.0 through 20.0.0. It now works. [#1136][] (@victorlin)

[#812]: https://github.com/nextstrain/augur/pull/812
[#1136]: https://github.com/nextstrain/augur/issues/1136
[#1152]: https://github.com/nextstrain/augur/pull/1152
[#1171]: https://github.com/nextstrain/augur/issues/1171
[#1178]: https://github.com/nextstrain/augur/pull/1178
[`pandas.read_csv()`]: https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.read_csv.html

## 21.1.0 (14 March 2023)

26 changes: 22 additions & 4 deletions augur/io/metadata.py
@@ -12,6 +12,10 @@
from .file import open_file


# List of valid delimiters when reading a metadata file.
VALID_DELIMITERS = (',', '\t')


def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None):
"""Read metadata from a given filename and into a pandas `DataFrame` or
`TextFileReader` object.
@@ -60,8 +64,8 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None)
"""
    kwargs = {
-       "sep": None,
-       "engine": "python",
+       "sep": _get_delimiter(metadata_file),
+       "engine": "c",
        "skipinitialspace": True,
        "na_filter": False,
    }
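To illustrate the effect of this change outside of Augur, here is a minimal sketch (assuming pandas is installed; the sample data is made up): unlike `engine="python"` with `sep=None`, the C engine does not sniff the delimiter, so it must be passed explicitly.

```python
import io

import pandas as pd

# Hypothetical tab-delimited sample. With engine="c" the delimiter
# must be given explicitly rather than inferred via sep=None.
text = "strain\tcolumn\nSEQ_1\tA\nSEQ_2\tB\n"
df = pd.read_csv(
    io.StringIO(text),
    sep="\t",
    engine="c",
    skipinitialspace=True,
    na_filter=False,  # keep empty fields as strings, not NaN
)
print(df["column"].tolist())  # prints ['A', 'B']
```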
@@ -135,7 +139,6 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i
2. The provided *id_column* does not exist in the *metadata*
3. The *duplicate_reporting* method is set to ERROR_FIRST or ERROR_ALL and duplicate(s) are found
"""
-   valid_delimiters = [',', '\t']
seen_ids = set()
duplicate_ids = set()
with open_file(table) as handle:
@@ -149,7 +152,9 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i
handle = chain(table_sample_file, handle)

        try:
-           dialect = csv.Sniffer().sniff(table_sample, valid_delimiters)
+           # Note: this sort of duplicates _get_delimiter(), but it's easier if
+           # this is separate since it handles non-seekable buffers.
+           dialect = csv.Sniffer().sniff(table_sample, VALID_DELIMITERS)
except csv.Error as err:
raise AugurError(
f"Could not determine the delimiter of {table!r}. "
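The comment about non-seekable buffers refers to the sample-and-chain pattern used in this function: consume a few lines to sniff the dialect, then chain them back in front of the remaining stream so nothing is lost. A standalone sketch of that pattern (hypothetical helper name, stdlib only):

```python
import csv
import io
from itertools import chain, islice

def sniff_and_read(handle, n_sample_lines=2):
    """Sniff the dialect from the first lines of a possibly
    non-seekable stream, then parse the whole stream."""
    # Consume a small sample for the sniffer...
    sample = list(islice(handle, n_sample_lines))
    dialect = csv.Sniffer().sniff("".join(sample), (",", "\t"))
    # ...then chain the consumed lines back ahead of the rest.
    return list(csv.reader(chain(sample, handle), dialect=dialect))

rows = sniff_and_read(io.StringIO("strain,column\nSEQ_1,A\nSEQ_2,B\n"))
print(rows[0])  # prints ['strain', 'column']
```

This works even when `handle` cannot be rewound with `seek()`, e.g. stdin or a decompression stream.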
@@ -426,3 +431,16 @@ def write_records_to_tsv(records, output_file):

for record in records:
tsv_writer.writerow(record)


def _get_delimiter(path: str):
"""Get the delimiter of a file."""
with open_file(path) as file:
try:
# Infer the delimiter from the first line.
return csv.Sniffer().sniff(file.readline(), VALID_DELIMITERS).delimiter
except csv.Error as err:
raise AugurError(
f"Could not determine the delimiter of {path!r}. "
"File must be a CSV or TSV."
) from err
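As a rough standalone illustration of the restricted sniffing above (hypothetical function name; only the stdlib `csv` module is used), passing the valid delimiters to `csv.Sniffer.sniff()` makes any other delimiter raise `csv.Error`:

```python
import csv

def detect_delimiter(first_line):
    # Restricting the sniffer to ',' and '\t' means any other
    # delimiter (e.g. ':') raises csv.Error instead of being accepted.
    return csv.Sniffer().sniff(first_line, (",", "\t")).delimiter

print(repr(detect_delimiter("strain,column\n")))   # prints ','
print(repr(detect_delimiter("strain\tcolumn\n")))  # prints '\t'
```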
34 changes: 34 additions & 0 deletions tests/functional/filter/cram/filter-metadata-delimiter.t
@@ -0,0 +1,34 @@
Setup

$ source "$TESTDIR"/_setup.sh

Comma-delimited metadata is allowed. However, the output metadata will be tab-delimited.

$ cat >metadata.txt <<~~
> strain,column
> SEQ_1,A
> SEQ_2,B
> ~~

$ ${AUGUR} filter \
> --metadata metadata.txt \
> --exclude-where column=A \
> --output-metadata filtered.txt > /dev/null
$ cat filtered.txt
strain\tcolumn (esc)
SEQ_2\tB (esc)

Colon-delimited metadata is not allowed.

$ cat >metadata.txt <<~~
> strain:column
> SEQ_1:A
> SEQ_2:B
> ~~

$ ${AUGUR} filter \
> --metadata metadata.txt \
> --exclude-where column=A \
> --output-metadata filtered.txt > /dev/null
ERROR: Could not determine the delimiter of 'metadata.txt'. File must be a CSV or TSV.
[2]
