Skip to content

Commit

Permalink
Port parse-genbank-location into augur curate style [#1485]
Browse files Browse the repository at this point in the history
* Convert script over to expected sub-command style
* Add `--location-field` argument to preserve back-compat with older
  data files
* Add type hints throughout
* Add tests
  • Loading branch information
genehack committed Jul 1, 2024
1 parent 8ecd4eb commit 2100393
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 30 deletions.
3 changes: 2 additions & 1 deletion augur/curate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
from augur.io.sequences import write_records_to_fasta
from augur.types import DataErrorMethod
from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors
from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location


SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
Expand All @@ -24,6 +24,7 @@
apply_geolocation_rules,
apply_record_annotations,
abbreviate_authors,
parse_genbank_location,
]


Expand Down
85 changes: 56 additions & 29 deletions augur/curate/parse_genbank_location.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,86 @@
#!/usr/bin/env python3
"""
Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate
fields: 'country', 'division', and 'location'. Checks that a record is from
GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq".
Parses GenBank's 'location' field of the NDJSON record to 3 separate
fields: 'country', 'division', and 'location'.
Outputs the modified record to stdout.
Checks that a record is from GenBank by verifying that the 'database'
field has a value of "GenBank" or "RefSeq".
"""
import json
from sys import stdin, stderr, stdout

import argparse
from typing import Generator, List
from augur.io.print import print_err
from augur.utils import first_line

def parse_location(record: dict) -> dict:
# Expected pattern for the location field is "<country_value>[:<region>][, <locality>]"

def parse_location(
record: dict,
location_field_name: str,
) -> dict:
# Expected pattern for the location field is
# "<country_value>[:<region>][, <locality>]"
#
# See GenBank docs for their "country" field:
# https://www.ncbi.nlm.nih.gov/genbank/collab/country/
location_field = record.get("location", "")
location_field = record.get(location_field_name, "")
if not location_field:
print(
"`transform-genbank-location` requires a `location` field; this record does not have one.",
file=stderr,
print_err(
f"`parse-genbank-location` requires a `{location_field_name}` field; this record does not have one.",
)
# bail early because we're not gonna make any changes
return record

geographic_data = location_field.split(':')
geographic_data = location_field.split(":")

country = geographic_data[0]
division = ''
location = ''
division = ""
location = ""

if len(geographic_data) == 2:
division , _ , location = geographic_data[1].partition(',')
division, _, location = geographic_data[1].partition(",")

record['country'] = country.strip()
record['division'] = division.strip()
record['location'] = location.strip()
record["country"] = country.strip()
record["division"] = division.strip()
record["location"] = location.strip()

return record


if __name__ == '__main__':
def register_parser(
parent_subparsers: argparse._SubParsersAction,
) -> argparse._SubParsersAction:
parser = parent_subparsers.add_parser(
"parse-genbank-location",
parents=[parent_subparsers.shared_parser], # type: ignore
help=first_line(__doc__),
)

parser.add_argument(
"--location-field",
default="geo_loc_name",
help="The field containing the location, "
+ "in the format `<geo_loc_name>[:<region>][, <locality>]`",
)

return parser

for record in stdin:
record = json.loads(record)

database = record.get('database', '')
if database in {'GenBank', 'RefSeq'}:
parse_location(record)
def run(
args: argparse.Namespace,
records: List[dict],
) -> Generator[dict, None, None]:
for record in records:
database = record.get("database", "")
if database in {"GenBank", "RefSeq"}:
parse_location(
record,
args.location_field,
)
else:
if database:
error_msg = f"""Database value of {database} not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq"."""
else:
error_msg = "Record must contain `database` field to use `transform-genbank-location.`"

print(error_msg, file=stderr)
print_err(error_msg)

json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
print()
yield record
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Running the command with no arguments produces the expected output

$ echo '{"geo_loc_name":"Canada:Vancouver", "database":"GenBank"}' \
> | ${AUGUR} curate parse-genbank-location
{"geo_loc_name": "Canada:Vancouver", "database": "GenBank", "country": "Canada", "division": "Vancouver", "location": ""}

`--location-field` can be used to specify a different field name

$ echo '{"location":"Canada:Vancouver", "database":"GenBank"}' \
> | ${AUGUR} curate parse-genbank-location --location-field location
{"location": "", "database": "GenBank", "country": "Canada", "division": "Vancouver"}

`RefSeq` works as a database value

$ echo '{"geo_loc_name":"Canada:Vancouver", "database":"RefSeq"}' \
> | ${AUGUR} curate parse-genbank-location
{"geo_loc_name": "Canada:Vancouver", "database": "RefSeq", "country": "Canada", "division": "Vancouver", "location": ""}

Record with only a `country` value results in expected output

$ echo '{"geo_loc_name":"Canada", "database":"GenBank"}' \
> | ${AUGUR} curate parse-genbank-location
{"geo_loc_name": "Canada", "database": "GenBank", "country": "Canada", "division": "", "location": ""}

Record with `country`, `region`, and `location` values results in expected output

$ echo '{"geo_loc_name":"France:Cote d'\''Azur, Antibes", "database":"GenBank"}' \
> | ${AUGUR} curate parse-genbank-location
{"geo_loc_name": "France:Cote d'Azur, Antibes", "database": "GenBank", "country": "France", "division": "Cote d'Azur", "location": "Antibes"}
31 changes: 31 additions & 0 deletions tests/functional/curate/cram/parse-genbank-location/errors.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Records without a `database` field result in the expected warning

$ echo '{"geo_loc_name":"Canada:Vancouver"}' \
> | ${AUGUR} curate parse-genbank-location
Record must contain `database` field to use `transform-genbank-location.`
{"geo_loc_name": "Canada:Vancouver"}

Records with a `database` field with an unsupported value result in the expected warning

$ echo '{"geo_loc_name":"Canada:Vancouver", "database":"database"}' \
> | ${AUGUR} curate parse-genbank-location
Database value of database not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq".
{"geo_loc_name": "Canada:Vancouver", "database": "database"}

Records without a `location` field result in the expected warning

$ echo '{"database":"GenBank"}' \
> | ${AUGUR} curate parse-genbank-location
`parse-genbank-location` requires a `geo_loc_name` field; this record does not have one.
{"database": "GenBank"}

Records without a `location` field and a custom `--location-field` result in the expected warning

$ echo '{"database":"GenBank"}' \
> | ${AUGUR} curate parse-genbank-location --location-field location
`parse-genbank-location` requires a `location` field; this record does not have one.
{"database": "GenBank"}

0 comments on commit 2100393

Please sign in to comment.