From da845f43df12be4dab149bc0e6851e92c142e20c Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 9 Dec 2022 18:50:43 -0800 Subject: [PATCH 1/3] tests: Failing test for comma separated data in curate metadata TSV Add an additional "authors" field with comma separated values to the testing metadata TSV file, showing how this causes the csv sniffer to fail to identify the delimiter for the file. --- tests/functional/curate/cram/metadata-input.t | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/functional/curate/cram/metadata-input.t b/tests/functional/curate/cram/metadata-input.t index 34a4b10ab..b08ec828b 100644 --- a/tests/functional/curate/cram/metadata-input.t +++ b/tests/functional/curate/cram/metadata-input.t @@ -9,28 +9,28 @@ Running the `passthru` subcommand since it does not do any data transformations. Create metadata TSV file for testing. $ cat >$TMP/metadata.tsv <<~~ - > strain country date - > sequence_A USA 2020-10-01 - > sequence_B USA 2020-10-02 - > sequence_C USA 2020-10-03 + > strain country date authors + > sequence_A USA 2020-10-01 A,B,C,D,E,F,G,H,I,J,K + > sequence_B USA 2020-10-02 A,B,C,D,E,F,G,H,I,J,K + > sequence_C USA 2020-10-03 A,B,C,D,E,F,G,H,I,J,K > ~~ Test TSV metadata input $ ${AUGUR} curate passthru \ > --metadata $TMP/metadata.tsv - {"strain": "sequence_A", "country": "USA", "date": "2020-10-01"} - {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"} - {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"} + {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"} Test TSV metadata input from stdin $ cat $TMP/metadata.tsv \ > | ${AUGUR} curate normalize-strings \ > --metadata - - {"strain": "sequence_A", "country": "USA", "date": 
"2020-10-01"} - {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"} - {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"} + {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"} Create metadata CSV file for testing. From 04f1e4ce02dbf45248ccafd726341b4154c66802 Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 9 Dec 2022 18:55:27 -0800 Subject: [PATCH 2/3] read_table_to_dict: only sample the first line Only use the first line or header of the CSV/TSV file to determine the delimiter of the file. This prevents csv.Sniffer from failing to determine the delimiter when the data values include commas or tabs. --- augur/io/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 7fbf3c6e4..c4ee27c62 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -138,7 +138,7 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i duplicate_ids = set() with open_file(table) as handle: # Get sample to determine delimiter - table_sample = handle.read(1024) + table_sample = handle.readline() if handle.seekable(): handle.seek(0) From 0f740f3f5dcf6e62687276e45a534e1538a42399 Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 9 Dec 2022 19:08:11 -0800 Subject: [PATCH 3/3] Update changelog --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index f8c6bbbed..3897d6183 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -7,10 +7,12 @@ * docs: Update the API documentation to reflect the latest state of things in the codebase. [#1087][] (@victorlin) * Fix support for Biopython version 1.80 which deprecated `Bio.Seq.Seq.ungap()`. 
[#1102][] (@victorlin) * export v2: Fixed a bug where colorings for zero values via `--colors` would not get applied to the exported Auspice JSON. [#1100][] (@joverlee521) +* curate: Fixed a bug where metadata TSVs failed to parse if data within a column included comma-separated values. [#1110][] (@joverlee521) [#1087]: https://github.com/nextstrain/augur/pull/1087 [#1100]: https://github.com/nextstrain/augur/pull/1100 [#1102]: https://github.com/nextstrain/augur/pull/1102 +[#1110]: https://github.com/nextstrain/augur/pull/1110 ## 18.2.0 (15 November 2022)