Skip to content

Commit

Permalink
Port abbreviate-authors script from ingest into augur curate style [#1483]
Browse files Browse the repository at this point in the history

* Convert script over to expected sub-command style
* Make code match comments (comments say "et al." in multiple places;
  code was adding "et al")
* Add type hints throughout
* Add tests
  • Loading branch information
genehack committed Jun 28, 2024
1 parent 2a1bcd8 commit 67b1517
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 29 deletions.
3 changes: 2 additions & 1 deletion augur/curate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
from augur.io.sequences import write_records_to_fasta
from augur.types import DataErrorMethod
from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations
from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors


SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
Expand All @@ -23,6 +23,7 @@
titlecase,
apply_geolocation_rules,
apply_record_annotations,
abbreviate_authors,
]


Expand Down
78 changes: 50 additions & 28 deletions augur/curate/abbreviate_authors.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,43 @@
#!/usr/bin/env python3
"""
Abbreviates a full list of authors to be '<first author> et al.' of the NDJSON
record from stdin and outputs modified records to stdout.
Note: This is a "best effort" approach and can potentially mangle the author name.
"""

import argparse
import json
import re
from sys import stderr, stdin, stdout
from typing import Generator, List
from augur.io.print import print_err
from augur.utils import first_line


def parse_authors(record: dict, authors_field: str, default_value: str,
index: int, abbr_authors_field: str = None) -> dict:
def parse_authors(
record: dict,
authors_field: str,
default_value: str,
index: int,
abbr_authors_field: str = None,
) -> dict:
# Normalize whitespace: collapse each run of whitespace into a single space
new_authors = re.sub(r'\s+', ' ', record[authors_field])
new_authors = re.sub(r"\s+", " ", record[authors_field])

if new_authors == "":
new_authors = default_value
else:
# Split authors list on comma/semicolon
# OR "and"/"&" with at least one space before and after
new_authors = re.split(r'(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)', new_authors)[0]
new_authors = re.split(r"(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)", new_authors)[0]

# if it does not already end with " et al.", add it
if not new_authors.strip('. ').endswith(" et al"):
new_authors += ' et al'
if not new_authors.strip(". ").endswith(" et al"):
new_authors += " et al."

if abbr_authors_field:
if record.get(abbr_authors_field):
print(
print_err(
f"WARNING: the {abbr_authors_field!r} field already exists",
f"in record {index} and will be overwritten!",
file=stderr
)

record[abbr_authors_field] = new_authors
Expand All @@ -42,25 +47,42 @@ def parse_authors(record: dict, authors_field: str, default_value: str,
return record


if __name__ == '__main__':
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter
def register_parser(
parent_subparsers: argparse._SubParsersAction,
) -> argparse._SubParsersAction:
parser = parent_subparsers.add_parser(
"abbreviate-authors",
parents=[parent_subparsers.shared_parser], # type: ignore
help=first_line(__doc__),
)

parser.add_argument(
"--authors-field",
default="authors",
help="The field containing list of authors.",
)
parser.add_argument(
"--default-value",
default="?",
help="Default value to use if authors list is empty.",
)
parser.add_argument(
"--abbr-authors-field",
help="The field for the generated abbreviated authors. "
+ "If not provided, the original authors field will be modified.",
)
parser.add_argument("--authors-field", default="authors",
help="The field containing list of authors.")
parser.add_argument("--default-value", default="?",
help="Default value to use if authors list is empty.")
parser.add_argument("--abbr-authors-field",
help="The field for the generated abbreviated authors. " +
"If not provided, the original authors field will be modified.")

args = parser.parse_args()
return parser

for index, record in enumerate(stdin):
record = json.loads(record)

parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field)
def run(args: argparse.Namespace, records: List[dict]) -> Generator[dict, None, None]:
for index, record in enumerate(records):
parse_authors(
record,
args.authors_field,
args.default_value,
index,
args.abbr_authors_field,
)

json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
print()
yield record
71 changes: 71 additions & 0 deletions tests/functional/curate/cram/abbreviate-authors/default-behavior.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Running the command with no arguments produces the expected output

$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}

`--authors-field` can be used to set an alternative field name

$ echo '{"author-list":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors \
> --authors-field="author-list"
{"author-list": "Troesemeier et al."}

`--default-value` can be used to provide a default for an empty field

$ echo '{"authors":""}' \
> | ${AUGUR} curate abbreviate-authors \
> --default="??"
{"authors": "??"}

`--abbr-authors-field` can be used to put the abbreviated authors into a different field

$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors \
> --abbr-authors-field="abbr-authors"
{"authors": "Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors": "Troesemeier et al."}

`--authors-field` and `--default-value` work together

$ echo '{"author-list":""}' \
> | ${AUGUR} curate abbreviate-authors \
> --authors-field="author-list" \
> --default-value="???"
{"author-list": "???"}

`--authors-field` and `--abbr-authors-field` work together

$ echo '{"author-list":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors \
> --authors-field="author-list" \
> --abbr-authors-field="abbr-authors"
{"author-list": "Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors": "Troesemeier et al."}

`--default-value` and `--abbr-authors-field` work together

$ echo '{"authors":""}' \
> | ${AUGUR} curate abbreviate-authors \
> --default-value="?" \
> --abbr-authors-field="abbr-authors"
{"authors": "", "abbr-authors": "?"}

All three options work together

$ echo '{"author-list":""}' \
> | ${AUGUR} curate abbreviate-authors \
> --authors-field="author-list" \
> --abbr-authors-field="abbr-authors" \
> --default-value="?!"
{"author-list": "", "abbr-authors": "?!"}

Running the command with no arguments and multiple records produces the expected output

$ echo '{"authors":"Troesemeier,J.-H. & Musso,D."}
> {"authors":"Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}
{"authors": "Bluemel et al."}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

If the authors list already ends in `et al.` don't add another.

$ echo '{"authors":"Troesemeier et al."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Semi-colon separator is supported

$ echo '{"authors":"Troesemeier,J.-H.; Musso,D.; Bluemel,J.; and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}

Ampersand separator is supported

$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. & Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}

Semi-colons and ampersand separators together are supported

$ echo '{"authors":"Troesemeier,J.-H.; Musso,D.; Bluemel,J. & Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Whitespace in author list gets stripped out

$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Overwriting an existing `--abbr-authors-field` value generates a warning

If the field named by `--abbr-authors-field` already has a value in the record, that value is overwritten and a warning is printed

$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors":"I EXIST"}' \
> | ${AUGUR} curate abbreviate-authors \
> --abbr-authors-field="abbr-authors"
WARNING: the 'abbr-authors' field already exists in record 0 and will be overwritten!
{"authors": "Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors": "Troesemeier et al."}

0 comments on commit 67b1517

Please sign in to comment.