diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py
index fce520e6b..ea0196061 100644
--- a/augur/curate/__init__.py
+++ b/augur/curate/__init__.py
@@ -12,7 +12,7 @@
 from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
 from augur.io.sequences import write_records_to_fasta
 from augur.types import DataErrorMethod
-from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location
+from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location, transform_strain_name
 
 
 SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
@@ -25,6 +25,7 @@
     apply_record_annotations,
     abbreviate_authors,
     parse_genbank_location,
+    transform_strain_name,
 ]
 
 
diff --git a/augur/curate/transform_strain_name.py b/augur/curate/transform_strain_name.py
index d86c0e40d..ae128893d 100644
--- a/augur/curate/transform_strain_name.py
+++ b/augur/curate/transform_strain_name.py
@@ -1,50 +1,99 @@
-#!/usr/bin/env python3
 """
-Verifies strain name pattern in the 'strain' field of the NDJSON record from
-stdin. Adds a 'strain' field to the record if it does not already exist.
-
-Outputs the modified records to stdout.
+Verifies strain name pattern in the 'strain' field of the NDJSON
+record. Adds a 'strain' field to the record if it does not already
+exist.
 """
+
 import argparse
-import json
 import re
-from sys import stderr, stdin, stdout
+from typing import Generator, List
+from augur.io.print import print_err
+from augur.utils import first_line
 
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--strain-regex", default="^.+$",
-        help="Regex pattern for strain names. " +
-             "Strain names that do not match the pattern will be dropped.")
-    parser.add_argument("--backup-fields", nargs="*",
-        help="List of backup fields to use as strain name if the value in 'strain' " +
-             "does not match the strain regex pattern. " +
-             "If multiple fields are provided, will use the first field that has a non-empty string.")
-
-    args = parser.parse_args()
-
-    strain_name_pattern = re.compile(args.strain_regex)
-
-    for index, record in enumerate(stdin):
-        record = json.loads(record)
-
-        # Verify strain name matches the strain regex pattern
-        if strain_name_pattern.match(record.get('strain', '')) is None:
-            # Default to empty string if not matching pattern
-            record['strain'] = ''
-
-            # Use non-empty value of backup fields if provided
-            if args.backup_fields:
-                for field in args.backup_fields:
-                    if record.get(field):
-                        record['strain'] = str(record[field])
-                        break
-
-        if record['strain'] == '':
-            print(f"WARNING: Record number {index} has an empty string as the strain name.", file=stderr)
-
-        json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
-        print()
+def transform_name(
+    record: dict,
+    index: int,
+    strain_name_pattern: re.Pattern,
+    backup_fields: List[str],
+) -> dict:
+    """
+    Validate the 'strain' field of a record, falling back to backup fields.
+
+    A missing or null 'strain' value is treated as an empty string and any
+    other value is converted to a string before it is matched against
+    *strain_name_pattern*. If the value does not match, the first non-empty
+    field listed in *backup_fields* is used instead, or an empty string if
+    there is none. A warning that includes *index* is emitted through
+    ``print_err`` when the resulting strain name is empty.
+
+    The record is modified in place, always has a 'strain' key afterwards,
+    and is also returned.
+    """
+    strain = record.get("strain")
+    strain = "" if strain is None else str(strain)
+
+    # Verify strain name matches the strain regex pattern
+    if strain_name_pattern.match(strain) is None:
+        # Default to empty string if not matching pattern
+        strain = ""
+
+        # Use non-empty value of backup fields if provided
+        for field in backup_fields or []:
+            if record.get(field):
+                strain = str(record[field])
+                break
+
+    # Always write the field back: a missing 'strain' can still satisfy a
+    # pattern that matches the empty string, and the key must exist below.
+    record["strain"] = strain
+
+    if strain == "":
+        print_err(
+            f"WARNING: Record number {index} has an empty string as the strain name.",
+        )
+
+    return record
+
+
+def register_parser(
+    parent_subparsers: argparse._SubParsersAction,
+) -> argparse.ArgumentParser:
+    """Add the ``transform-strain-name`` subcommand parser and return it."""
+    parser = parent_subparsers.add_parser(
+        "transform-strain-name",
+        parents=[parent_subparsers.shared_parser],  # type: ignore
+        help=first_line(__doc__),
+    )
+
+    parser.add_argument(
+        "--strain-regex",
+        default="^.+$",
+        help="Regex pattern for strain names. "
+        + "Strain names that do not match the pattern will be dropped.",
+    )
+    parser.add_argument(
+        "--backup-fields",
+        nargs="*",
+        default=[],
+        help="List of backup fields to use as strain name if the value in 'strain' "
+        + "does not match the strain regex pattern. "
+        + "If multiple fields are provided, will use the first field that has a non-empty string.",
+    )
+
+    return parser
+
+
+def run(args: argparse.Namespace, records: List[dict]) -> Generator[dict, None, None]:
+    """Yield every record after checking its strain name with ``transform_name``."""
+    strain_name_pattern = re.compile(args.strain_regex)
+
+    for index, record in enumerate(records):
+        transform_name(
+            record,
+            index,
+            strain_name_pattern,
+            args.backup_fields,
+        )
+
+        yield record
diff --git a/tests/functional/curate/cram/transform-strain-name/default-behavior.t b/tests/functional/curate/cram/transform-strain-name/default-behavior.t
new file mode 100644
index 000000000..96b85bd9a
--- /dev/null
+++ b/tests/functional/curate/cram/transform-strain-name/default-behavior.t
@@ -0,0 +1,35 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"
+
+Running the command with no arguments produces the expected output
+
+  $ echo '{"strain":"OC43"}' \
+  >   | ${AUGUR} curate transform-strain-name
+  {"strain": "OC43"}
+
+Providing a strain regex to the command produces the expected output when the strain matches
+
+  $ echo '{"strain":"OC43"}' \
+  >   | ${AUGUR} curate transform-strain-name --strain-regex '^\w{2}\d{2}$'
+  {"strain": "OC43"}
+
+Providing a strain regex to the command produces an empty field and a warning when the strain doesn't match
+
+  $ echo '{"strain":"OC43"}' \
+  >   | ${AUGUR} curate transform-strain-name --strain-regex '^\d{2}\w{2}$'
+  WARNING: Record number 0 has an empty string as the strain name.
+  {"strain": ""}
+
+Providing a backup field produces the expected output
+
+  $ echo '{"potential-strain":"OC43"}' \
+  >   | ${AUGUR} curate transform-strain-name --backup-fields potential-strain
+  {"potential-strain": "OC43", "strain": "OC43"}
+
+
+Multiple backup fields produce the expected output
+
+  $ echo '{"potential-strain2":"OC43"}' \
+  >   | ${AUGUR} curate transform-strain-name --backup-fields potential-strain potential-strain2
+  {"potential-strain2": "OC43", "strain": "OC43"}