Skip to content

Commit

Permalink
Port transform-strain-name into augur curate style [#1486]
Browse files Browse the repository at this point in the history
* Convert script over to expected sub-cmd style
* Add type hints throughout
* Add tests
  • Loading branch information
genehack committed Jul 2, 2024
1 parent 90ac28b commit ff2faf9
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 38 deletions.
3 changes: 2 additions & 1 deletion augur/curate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
from augur.io.sequences import write_records_to_fasta
from augur.types import DataErrorMethod
from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location
from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location, transform_strain_name


SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
Expand All @@ -25,6 +25,7 @@
apply_record_annotations,
abbreviate_authors,
parse_genbank_location,
transform_strain_name,
]


Expand Down
102 changes: 65 additions & 37 deletions augur/curate/transform_strain_name.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,78 @@
#!/usr/bin/env python3
"""
Verifies strain name pattern in the 'strain' field of the NDJSON record from
stdin. Adds a 'strain' field to the record if it does not already exist.
Outputs the modified records to stdout.
Verifies strain name pattern in the 'strain' field of each NDJSON record.
Adds a 'strain' field to the record if it does not already exist.
"""

import argparse
import json
import re
from sys import stderr, stdin, stdout
from typing import Generator, List
from augur.io.print import print_err
from augur.utils import first_line


if __name__ == '__main__':
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("--strain-regex", default="^.+$",
help="Regex pattern for strain names. " +
"Strain names that do not match the pattern will be dropped.")
parser.add_argument("--backup-fields", nargs="*",
help="List of backup fields to use as strain name if the value in 'strain' " +
"does not match the strain regex pattern. " +
"If multiple fields are provided, will use the first field that has a non-empty string.")
def transform_name(
    record: dict,
    index: int,
    strain_name_pattern: re.Pattern,
    backup_fields: List[str],
) -> dict:
    """
    Validate the 'strain' field of a record, falling back to backup fields.

    The record is modified in place and the same object is returned.

    Parameters
    ----------
    record
        Record to check; its 'strain' key is created if it is missing.
    index
        Position of the record in the input, used only in the warning text.
    strain_name_pattern
        Compiled pattern applied with ``match`` to the current strain value.
    backup_fields
        Field names consulted, in order, when the strain value does not match.
        The first field holding a truthy value is used, converted with
        ``str``. May be empty or ``None``.

    Returns
    -------
    dict
        The same ``record`` object, with 'strain' always present.
    """
    strain = record.get("strain")

    # ``Pattern.match`` raises TypeError on non-strings (e.g. a JSON null), so
    # a missing, null or non-string value is treated as a non-matching name
    # rather than being handed to the regex.
    if not isinstance(strain, str) or strain_name_pattern.match(strain) is None:
        # Default to empty string if not matching pattern
        record["strain"] = ""

        # Use non-empty value of backup fields if provided
        for field in backup_fields or []:
            if record.get(field):
                record["strain"] = str(record[field])
                break

    if record["strain"] == "":
        print_err(
            f"WARNING: Record number {index} has an empty string as the strain name.",
        )

    return record

strain_name_pattern = re.compile(args.strain_regex)

for index, record in enumerate(stdin):
record = json.loads(record)
def register_parser(
    parent_subparsers: argparse._SubParsersAction,
) -> argparse.ArgumentParser:
    """
    Register the ``transform-strain-name`` subcommand and its options.

    Parameters
    ----------
    parent_subparsers
        Sub-parsers action to attach the new command to. It must expose a
        ``shared_parser`` attribute, which is passed as a parent parser.

    Returns
    -------
    argparse.ArgumentParser
        The parser created for the ``transform-strain-name`` subcommand.
    """
    parser = parent_subparsers.add_parser(
        "transform-strain-name",
        parents=[parent_subparsers.shared_parser],  # type: ignore
        help=first_line(__doc__),
    )

    parser.add_argument(
        "--strain-regex",
        default="^.+$",
        help="Regex pattern for strain names. "
        "Strain names that do not match the pattern will be dropped.",
    )
    parser.add_argument(
        "--backup-fields",
        nargs="*",
        default=[],
        help="List of backup fields to use as strain name if the value in 'strain' "
        "does not match the strain regex pattern. "
        "If multiple fields are provided, will use the first field that has a non-empty string.",
    )

    return parser

# Verify strain name matches the strain regex pattern
if strain_name_pattern.match(record.get('strain', '')) is None:
# Default to empty string if not matching pattern
record['strain'] = ''
# Use non-empty value of backup fields if provided
if args.backup_fields:
for field in args.backup_fields:
if record.get(field):
record['strain'] = str(record[field])
break

if record['strain'] == '':
print(f"WARNING: Record number {index} has an empty string as the strain name.", file=stderr)
def run(args: argparse.Namespace, records: List[dict]) -> Generator[dict, None, None]:
    """
    Apply the strain-name check to every record and yield each one in turn.

    The pattern in ``args.strain_regex`` is compiled once, on first
    iteration. Each record is passed to ``transform_name`` with its
    zero-based position and ``args.backup_fields``, then yielded.
    """
    compiled_regex = re.compile(args.strain_regex)

    position = 0
    for entry in records:
        transform_name(
            record=entry,
            index=position,
            strain_name_pattern=compiled_regex,
            backup_fields=args.backup_fields,
        )
        yield entry
        position += 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Running the command with no arguments produces the expected output

$ echo '{"strain":"OC43"}' \
> | ${AUGUR} curate transform-strain-name
{"strain": "OC43"}

Providing a strain regex to the command produces the expected output when the strain matches

$ echo '{"strain":"OC43"}' \
> | ${AUGUR} curate transform-strain-name --strain-regex '^\w{2}\d{2}$'
{"strain": "OC43"}
Providing a strain regex to the command produces an empty field and a warning when the strain doesn't match

$ echo '{"strain":"OC43"}' \
> | ${AUGUR} curate transform-strain-name --strain-regex '^\d{2}\w{2}$'
WARNING: Record number 0 has an empty string as the strain name.
{"strain": ""}

Providing a backup field produces the expected output

$ echo '{"potential-strain":"OC43"}' \
> | ${AUGUR} curate transform-strain-name --backup-fields potential-strain
{"potential-strain": "OC43", "strain": "OC43"}


Multiple backup fields produce the expected output

$ echo '{"potential-strain2":"OC43"}' \
> | ${AUGUR} curate transform-strain-name --backup-fields potential-strain potential-strain2
{"potential-strain2": "OC43", "strain": "OC43"}

0 comments on commit ff2faf9

Please sign in to comment.