From 6ad8fac1cbe7dba3197585b6867d7785ea77dafe Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 2 Jul 2024 14:20:30 -0700
Subject: [PATCH 1/3] augur curate I/O: validate records have same fields

Validate records have the same fields before and after the subcommand
modifies the records.

Includes a functional test to check for the input error, but there's no
good way to check for the output error. Hopefully we just never see it!
---
 augur/curate/__init__.py                      | 53 ++++++++++++++++---
 .../functional/curate/cram/validate-records.t | 37 +++++++++++++
 2 files changed, 84 insertions(+), 6 deletions(-)
 create mode 100644 tests/functional/curate/cram/validate-records.t

diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py
index ea0196061..c8aacbbc5 100644
--- a/augur/curate/__init__.py
+++ b/augur/curate/__init__.py
@@ -5,6 +5,7 @@
 import sys
 from collections import deque
 from textwrap import dedent
+from typing import Iterable, Set
 
 from augur.argparse_ import ExtendOverwriteDefault, add_command_subparsers
 from augur.errors import AugurError
@@ -120,6 +121,40 @@ def register_parser(parent_subparsers):
     return parser
 
 
+def validate_records(records: Iterable[dict], is_input: bool) -> Iterable[dict]:
+    """
+    Validate that the provided *records* all have the same fields.
+    Uses the keys of the first record to check against all other records.
+
+    Parameters
+    ----------
+    records: iterable of dict
+
+    is_input: bool
+        Whether the provided records come directly from user provided input
+    """
+    error_message = "Records do not have the same fields! "
+    if is_input:
+        error_message += "Please check your input data has the same fields."
+    else:
+        # Hopefully users should not run into this error as it means we are
+        # not uniformly adding/removing fields from records
+        error_message += dedent("""\
+            Something unexpected happened during the augur curate command.
+            To report this, please open a new issue including the original command:
+                <https://github.com/nextstrain/augur/issues/new/choose>
+            """)
+
+    first_record_keys: Set[str] = set()
+    for idx, record in enumerate(records):
+        if idx == 0:
+            first_record_keys.update(record.keys())
+        else:
+            if set(record.keys()) != first_record_keys:
+                raise AugurError(error_message)
+        yield record
+
+
 def run(args):
     # Print help if no subcommands are used
     if not getattr(args, SUBCOMMAND_ATTRIBUTE, None):
@@ -177,25 +212,31 @@ def run(args):
             input files can be provided via the command line options `--metadata` and `--fasta`.
             See the command's help message for more details."""))
 
+    # Validate records have the same input fields
+    validated_input_records = validate_records(records, True)
+
     # Run subcommand to get modified records
-    modified_records = getattr(args, SUBCOMMAND_ATTRIBUTE).run(args, records)
+    modified_records = getattr(args, SUBCOMMAND_ATTRIBUTE).run(args, validated_input_records)
+
+    # Validate modified records have the same output fields
+    validated_output_records = validate_records(modified_records, False)
 
     # Output modified records
     # First output FASTA, since the write fasta function yields the records again
     # and removes the sequences from the records
     if args.output_fasta:
-        modified_records = write_records_to_fasta(
-            modified_records,
+        validated_output_records = write_records_to_fasta(
+            validated_output_records,
             args.output_fasta,
             args.output_id_field,
             args.output_seq_field)
 
     if args.output_metadata:
-        write_records_to_tsv(modified_records, args.output_metadata)
+        write_records_to_tsv(validated_output_records, args.output_metadata)
 
     if not (args.output_fasta or args.output_metadata):
-        dump_ndjson(modified_records)
+        dump_ndjson(validated_output_records)
     else:
         # Exhaust generator to ensure we run through all records
         # when only a FASTA output is requested but not a metadata output
-        deque(modified_records, maxlen=0)
+        deque(validated_output_records, maxlen=0)
diff --git a/tests/functional/curate/cram/validate-records.t b/tests/functional/curate/cram/validate-records.t
new file mode 100644
index 000000000..f5b9bdaa2
--- /dev/null
+++ b/tests/functional/curate/cram/validate-records.t
@@ -0,0 +1,37 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Testing records are validated appropriately by augur curate.
+Create NDJSON file for testing validation catches records with different fields.
+
+  $ cat >records.ndjson <<~~
+  > {"string": "string_1"}
+  > {"string": "string_2"}
+  > {"string": "string_3"}
+  > {"string": "string_4", "number": 123}
+  > ~~
+
+This will always pass thru the records that pass validation but should raise an
+error when it encounters the record with mismatched fields.
+
+  $ cat records.ndjson | ${AUGUR} curate passthru
+  ERROR: Records do not have the same fields! Please check your input data has the same fields.
+  {"string": "string_1"}
+  {"string": "string_2"}
+  {"string": "string_3"}
+  [2]
+
+Passing the records through multiple augur curate commands should raise the
+same error when it encounters the record with mismatched fields.
+
+  $ set -o pipefail
+  $ cat records.ndjson \
+  >   | ${AUGUR} curate passthru \
+  >   | ${AUGUR} curate passthru \
+  >   | ${AUGUR} curate passthru
+  ERROR: Records do not have the same fields! Please check your input data has the same fields.
+  {"string": "string_1"}
+  {"string": "string_2"}
+  {"string": "string_3"}
+  [2]

From 9811491b5a8aa5c23ac758a5b8f073d1b00c61c5 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Wed, 3 Jul 2024 12:20:09 -0700
Subject: [PATCH 2/3] Update changelog

---
 CHANGES.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index 64ce16f43..d42eb8912 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -5,6 +5,7 @@
 ### Major changes
 
 * curate format-dates: Raises an error if provided date field does not exist in records. [#1509][] (@joverlee521)
+* All curate subcommands: Verifies all input records have the same fields and raises an error if a record does not have matching fields. [#1518][] (@joverlee521)
 
 ### Features
 
@@ -29,6 +30,7 @@
 [#1495]: https://github.com/nextstrain/augur/pull/1495
 [#1501]: https://github.com/nextstrain/augur/pull/1501
 [#1509]: https://github.com/nextstrain/augur/pull/1509
+[#1518]: https://github.com/nextstrain/augur/pull/1518
 
 ## 24.4.0 (15 May 2024)
 

From 6e35dd485422bd5039c4efebfbcdc0ea1c6501eb Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Wed, 3 Jul 2024 12:25:31 -0700
Subject: [PATCH 3/3] Fix changelog link

Follow up to https://github.com/nextstrain/augur/pull/1514
---
 CHANGES.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGES.md b/CHANGES.md
index d42eb8912..4f5ad277f 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -15,7 +15,7 @@
 * Added a new sub-command `augur curate abbreviate-authors` to abbreviate lists of authors to "<first author> et al." Previously, this was avaliable as the `transform-authors` script within the nextstrain/ingest repo. [#1483][] (@genehack)
 * Added a new sub-command `augur curate parse-genbank-location` to parse the `geo_loc_name` field from GenBank reconds. Previously, this was available as the `translate-genbank-location` script within the nextstrain/ingest repo. [#1485][] (@genehack)
 * curate format-dates: Added defaults to `--expected-date-formats` so that ISO 8601 dates (`%Y-%m-%d`) and its various masked forms (e.g. `%Y-XX-XX`) are automatically parsed by the command. [#1501][] (@joverlee521)
-* Added a new sub-command `augur curate translate-strain-name` to filter strain names based on matching a regular expression. Previously, this was available as the `translate-strain-names` script within the nextstrain/ingest repo. [#1486][] (@genehack)
+* Added a new sub-command `augur curate translate-strain-name` to filter strain names based on matching a regular expression. Previously, this was available as the `translate-strain-names` script within the nextstrain/ingest repo. [#1514][] (@genehack)
 
 ### Bug Fixes
 
@@ -30,6 +30,7 @@
 [#1495]: https://github.com/nextstrain/augur/pull/1495
 [#1501]: https://github.com/nextstrain/augur/pull/1501
 [#1509]: https://github.com/nextstrain/augur/pull/1509
+[#1514]: https://github.com/nextstrain/augur/pull/1514
 [#1518]: https://github.com/nextstrain/augur/pull/1518
 
 ## 24.4.0 (15 May 2024)