Skip to content

Commit

Permalink
Adds augur curate titlecase sub-command
Browse files Browse the repository at this point in the history
Adds a new sub-command `augur curate titlecase` based on the transform-string-fields
script in the monkeypox repo. The `augur curate normalize` sub-command
has already been added based on the same script (#1039).

Overall this is part of filling in the gaps in the augur curate suite of commands (#860),
specifically addressing issue (#999), and is a follow-up to #1039.

`augur curate titlecase` would transform the values of a given metadata field to titlecase.
This is useful for normalizing the values of a string that may contain inconsistent
capitalization such as "North America" and "north america".

Co-authored-by: Jover Lee <joverlee521@gmail.com>
  • Loading branch information
j23414 and joverlee521 committed Apr 19, 2023
1 parent e6ff9ef commit 876cf98
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 1 deletion.
3 changes: 2 additions & 1 deletion augur/curate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
from augur.io.sequences import write_records_to_fasta
from augur.types import DataErrorMethod
from . import normalize_strings, passthru
from . import normalize_strings, passthru, titlecase


# Attribute name under which the selected curate subcommand is presumably
# recorded on the parsed arguments — TODO confirm against the code that uses it.
SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
# Subcommand modules of this package (imported via `from . import ...`).
# NOTE(review): each module is presumably expected to expose `register_parser`
# and `run`, as the titlecase module does — confirm against the dispatcher.
SUBCOMMANDS = [
passthru,
normalize_strings,
titlecase,
]


Expand Down
127 changes: 127 additions & 0 deletions augur/curate/titlecase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
"""
Standardizes string fields of the NDJSON record from stdin and outputs the
modified record to stdout.
"""
import argparse

import re
import unicodedata
from typing import Optional, Set, Union

from augur.errors import AugurError
from augur.io.print import print_err
from augur.types import DataErrorMethod

def register_parser(parent_subparsers):
    """
    Register the ``titlecase`` sub-command on *parent_subparsers*.

    The new parser inherits the options of ``parent_subparsers.shared_parser``
    and uses this module's docstring as its help text.

    Returns the newly created parser.
    """
    titlecase_parser = parent_subparsers.add_parser(
        "titlecase",
        parents=[parent_subparsers.shared_parser],
        help=__doc__,
    )

    # Options the user must always supply.
    required_group = titlecase_parser.add_argument_group(title="REQUIRED")
    required_group.add_argument(
        "--titlecase-fields",
        nargs="*",
        required=True,
        help="List of fields to convert to titlecase.",
    )

    # Options that tune how individual words are cased and how failures
    # are surfaced.
    optional_group = titlecase_parser.add_argument_group(title="OPTIONAL")
    optional_group.add_argument(
        "--articles",
        nargs="*",
        help="List of articles that should not be cast to titlecase.",
    )
    optional_group.add_argument(
        "--abbreviations",
        nargs="*",
        help="List of abbreviations that should not be cast to titlecase, keeps uppercase.",
    )
    optional_group.add_argument(
        "--failure-reporting",
        type=DataErrorMethod,
        choices=list(DataErrorMethod),
        default=DataErrorMethod.ERROR_FIRST,
        help="How should failed titlecase formatting be reported.",
    )

    return titlecase_parser


def titlecase(text: Union[str, None], articles: Optional[Set[str]] = None, abbreviations: Optional[Set[str]] = None) -> Optional[str]:
    """
    Originally from nextstrain/ncov-ingest

    Returns a title cased copy of *text*.

    *text* is split at word boundaries and every word is title cased, except:

    - words whose uppercase form is in *abbreviations* are returned in
      uppercase;
    - words whose casefolded form is in *articles* are returned in lowercase,
      unless the word is the first word of *text*, which is title cased.

    *articles* are matched against the casefolded word and *abbreviations*
    against the uppercased word, so they should be provided in lowercase and
    uppercase respectively. Either may be omitted (or ``None``), in which case
    it is treated as empty.

    Returns ``None`` if *text* is not a string.

    >>> articles = {'a', 'and', 'of', 'the', 'le'}
    >>> abbreviations = {'USA', 'DC'}
    >>> titlecase("the night OF THE LIVING DEAD", articles)
    'The Night of the Living Dead'
    >>> titlecase("BRAINE-LE-COMTE, FRANCE", articles)
    'Braine-le-Comte, France'
    >>> titlecase("auvergne-RHÔNE-alpes", articles)
    'Auvergne-Rhône-Alpes'
    >>> titlecase("washington DC, usa", articles, abbreviations)
    'Washington DC, USA'
    >>> titlecase(None) is None
    True
    """
    if not isinstance(text, str):
        return None

    # Use immutable empty sets instead of mutable literals as defaults.
    if articles is None:
        articles = frozenset()
    if abbreviations is None:
        abbreviations = frozenset()

    def changecase(index, word):
        upper = word.upper()
        if upper in abbreviations:
            return upper
        # Splitting on r'\b' always puts the first word of the text at
        # index 1 (index 0 is the possibly empty leading non-word chunk),
        # so a leading article is still title cased.
        if index != 1 and word.casefold() in articles:
            return word.lower()
        return word.title()

    # Non-word chunks (spaces, punctuation) pass through changecase
    # unchanged because they contain no cased characters.
    return ''.join(
        changecase(index, word)
        for index, word in enumerate(re.split(r'\b', text))
    )


def run(args, records):
    """
    Generator that yields a copy of each record in *records* with the fields
    listed in ``args.titlecase_fields`` converted to titlecase.

    ``args.articles`` and ``args.abbreviations`` (when non-empty) are passed
    to ``titlecase`` as sets. A field that is absent from a record is
    treated as the empty string.

    When ``titlecase`` returns ``None`` for a field, the value is left
    untouched and the failure is handled per ``args.failure_reporting``:
    ERROR_FIRST raises ``AugurError`` immediately, WARN prints a warning,
    and every failure is collected for a final summary that is raised
    (ERROR_ALL) or printed (WARN) once all records have been yielded.
    SILENT suppresses the summary.
    """
    reporting_mode = args.failure_reporting
    article_set = set(args.articles) if args.articles else set()
    abbreviation_set = set(args.abbreviations) if args.abbreviations else set()
    collected_failures = []

    for record_id, source_record in enumerate(records):
        # Work on a copy so the caller's record is never mutated.
        updated = source_record.copy()

        for field in args.titlecase_fields:
            converted = titlecase(updated.get(field, ""), article_set, abbreviation_set)

            if converted is not None:
                updated[field] = converted
                continue

            failure_message = f"Failed to titlecase {field} in record {record_id}"
            if reporting_mode is DataErrorMethod.ERROR_FIRST:
                raise AugurError(failure_message)

            if reporting_mode is DataErrorMethod.WARN:
                print_err(f"WARNING: {failure_message}")

            # Keep track of failures for final summary
            collected_failures.append((record_id, field, updated.get(field, "")))

        yield updated

    # Nothing to summarise when reporting is silenced or nothing failed.
    if reporting_mode is DataErrorMethod.SILENT or not collected_failures:
        return

    summary = (
        "Unable to change to titlecase for the following (record, field string):\n"
        + '\n'.join(repr(failure) for failure in collected_failures)
    )

    if reporting_mode is DataErrorMethod.ERROR_ALL:
        raise AugurError(summary)

    if reporting_mode is DataErrorMethod.WARN:
        print_err(f"WARNING: {summary}")
        return

    raise ValueError(f"Encountered unhandled failure reporting method: {reporting_mode!r}")
18 changes: 18 additions & 0 deletions tests/functional/curate/cram/titlecase.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
Setup

$ pushd "$TESTDIR" > /dev/null
$ export AUGUR="${AUGUR:-../../../../bin/augur}"


Create NDJSON file for testing titlecase with different forms

$ cat >$TMP/records.ndjson <<~~
> {"record": 1, "authors": "john smith", "author2": "Jane Doe"}
> ~~


Test that the "authors" and "author2" fields are converted to titlecase.

$ cat $TMP/records.ndjson \
> | ${AUGUR} curate titlecase --titlecase-fields "authors" "author2"
{"record": 1, "authors": "John Smith", "author2": "Jane Doe"}

0 comments on commit 876cf98

Please sign in to comment.