From 6ad8fac1cbe7dba3197585b6867d7785ea77dafe Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 2 Jul 2024 14:20:30 -0700 Subject: [PATCH 1/3] augur curate I/O: validate records have same fields Validate records have the same fields before and after the subcommand modifies the records. Includes a functional test to check for the input error, but there's no good way to check for the output error. Hopefully we just never see it! --- augur/curate/__init__.py | 53 ++++++++++++++++--- .../functional/curate/cram/validate-records.t | 37 +++++++++++++ 2 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 tests/functional/curate/cram/validate-records.t diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index ea0196061..c8aacbbc5 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -5,6 +5,7 @@ import sys from collections import deque from textwrap import dedent +from typing import Iterable, Set from augur.argparse_ import ExtendOverwriteDefault, add_command_subparsers from augur.errors import AugurError @@ -120,6 +121,40 @@ def register_parser(parent_subparsers): return parser +def validate_records(records: Iterable[dict], is_input: bool) -> Iterable[dict]: + """ + Validate that the provided *records* all have the same fields. + Uses the keys of the first record to check against all other records. + + Parameters + ---------- + records: iterable of dict + + is_input: bool + Whether the provided records come directly from user provided input + """ + error_message = "Records do not have the same fields! " + if is_input: + error_message += "Please check your input data has the same fields." + else: + # Hopefully users should not run into this error as it means we are + # not uniformly adding/removing fields from records + error_message += dedent("""\ + Something unexpected happened during the augur curate command. + To report this, please open a new issue including the original command: + + """) + + first_record_keys: Set[str] = set() + for idx, record in enumerate(records): + if idx == 0: + first_record_keys.update(record.keys()) + else: + if set(record.keys()) != first_record_keys: + raise AugurError(error_message) + yield record + + def run(args): # Print help if no subcommands are used if not getattr(args, SUBCOMMAND_ATTRIBUTE, None): @@ -177,25 +212,31 @@ def run(args): input files can be provided via the command line options `--metadata` and `--fasta`. See the command's help message for more details.""")) + # Validate records have the same input fields + validated_input_records = validate_records(records, True) + # Run subcommand to get modified records - modified_records = getattr(args, SUBCOMMAND_ATTRIBUTE).run(args, records) + modified_records = getattr(args, SUBCOMMAND_ATTRIBUTE).run(args, validated_input_records) + + # Validate modified records have the same output fields + validated_output_records = validate_records(modified_records, False) # Output modified records # First output FASTA, since the write fasta function yields the records again # and removes the sequences from the records if args.output_fasta: - modified_records = write_records_to_fasta( - modified_records, + validated_output_records = write_records_to_fasta( + validated_output_records, args.output_fasta, args.output_id_field, args.output_seq_field) if args.output_metadata: - write_records_to_tsv(modified_records, args.output_metadata) + write_records_to_tsv(validated_output_records, args.output_metadata) if not (args.output_fasta or args.output_metadata): - dump_ndjson(modified_records) + dump_ndjson(validated_output_records) else: # Exhaust generator to ensure we run through all records # when only a FASTA output is requested but not a metadata output - deque(modified_records, maxlen=0) + deque(validated_output_records, maxlen=0) diff --git a/tests/functional/curate/cram/validate-records.t b/tests/functional/curate/cram/validate-records.t new file mode 100644 index 000000000..f5b9bdaa2 --- /dev/null +++ b/tests/functional/curate/cram/validate-records.t @@ -0,0 +1,37 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Testing records are validated appropriately by augur curate. +Create NDJSON file for testing validation catches records with different fields. + + $ cat >records.ndjson <<~~ + > {"string": "string_1"} + > {"string": "string_2"} + > {"string": "string_3"} + > {"string": "string_4", "number": 123} + > ~~ + +This will always pass thru the records that pass validation but should raise an +error when it encounters the record with mismatched fields. + + $ cat records.ndjson | ${AUGUR} curate passthru + ERROR: Records do not have the same fields! Please check your input data has the same fields. + {"string": "string_1"} + {"string": "string_2"} + {"string": "string_3"} + [2] + +Passing the records through multiple augur curate commands should raise the +same error when it encounters the record with mismatched fields. + + $ set -o pipefail + $ cat records.ndjson \ + > | ${AUGUR} curate passthru \ + > | ${AUGUR} curate passthru \ + > | ${AUGUR} curate passthru + ERROR: Records do not have the same fields! Please check your input data has the same fields. + {"string": "string_1"} + {"string": "string_2"} + {"string": "string_3"} + [2] From 9811491b5a8aa5c23ac758a5b8f073d1b00c61c5 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 3 Jul 2024 12:20:09 -0700 Subject: [PATCH 2/3] Update changelog --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 64ce16f43..d42eb8912 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,6 +5,7 @@ ### Major changes * curate format-dates: Raises an error if provided date field does not exist in records. [#1509][] (@joverlee521) +* All curate subcommands: Verifies all input records have the same fields and raises an error if a record does not have matching fields. [#1518][] (@joverlee521) ### Features @@ -29,6 +30,7 @@ [#1495]: https://github.com/nextstrain/augur/pull/1495 [#1501]: https://github.com/nextstrain/augur/pull/1501 [#1509]: https://github.com/nextstrain/augur/pull/1509 +[#1518]: https://github.com/nextstrain/augur/pull/1518 ## 24.4.0 (15 May 2024) From 6e35dd485422bd5039c4efebfbcdc0ea1c6501eb Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 3 Jul 2024 12:25:31 -0700 Subject: [PATCH 3/3] Fix changelog link Follow up to https://github.com/nextstrain/augur/pull/1514 --- CHANGES.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index d42eb8912..4f5ad277f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,7 +15,7 @@ * Added a new sub-command `augur curate abbreviate-authors` to abbreviate lists of authors to " et al." Previously, this was avaliable as the `transform-authors` script within the nextstrain/ingest repo. [#1483][] (@genehack) * Added a new sub-command `augur curate parse-genbank-location` to parse the `geo_loc_name` field from GenBank reconds. Previously, this was available as the `translate-genbank-location` script within the nextstrain/ingest repo. [#1485][] (@genehack) * curate format-dates: Added defaults to `--expected-date-formats` so that ISO 8601 dates (`%Y-%m-%d`) and its various masked forms (e.g. `%Y-XX-XX`) are automatically parsed by the command. [#1501][] (@joverlee521) -* Added a new sub-command `augur curate translate-strain-name` to filter strain names based on matching a regular expression. Previously, this was available as the `translate-strain-names` script within the nextstrain/ingest repo. [#1486][] (@genehack) +* Added a new sub-command `augur curate translate-strain-name` to filter strain names based on matching a regular expression. Previously, this was available as the `translate-strain-names` script within the nextstrain/ingest repo. [#1514][] (@genehack) ### Bug Fixes @@ -30,6 +30,7 @@ [#1495]: https://github.com/nextstrain/augur/pull/1495 [#1501]: https://github.com/nextstrain/augur/pull/1501 [#1509]: https://github.com/nextstrain/augur/pull/1509 +[#1514]: https://github.com/nextstrain/augur/pull/1514 [#1518]: https://github.com/nextstrain/augur/pull/1518 ## 24.4.0 (15 May 2024)