Skip to content

Commit

Permalink
Copy abbreviate-authors script from ingest into augur curate [#1483]
Browse files Browse the repository at this point in the history
* Convert script over to expected sub-command style
* Make code match comments (comments say "et al." in multiple places;
  code was adding "et al")
* Add type hints throughout
* Add tests
  • Loading branch information
genehack committed Jun 28, 2024
1 parent 2b51ab4 commit 748a588
Show file tree
Hide file tree
Showing 6 changed files with 204 additions and 1 deletion.
3 changes: 2 additions & 1 deletion augur/curate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
from augur.io.sequences import write_records_to_fasta
from augur.types import DataErrorMethod
from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations
from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors


SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
Expand All @@ -23,6 +23,7 @@
titlecase,
apply_geolocation_rules,
apply_record_annotations,
abbreviate_authors,
]


Expand Down
88 changes: 88 additions & 0 deletions augur/curate/abbreviate_authors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""
Abbreviates a full list of authors to be '<first author> et al.' of the NDJSON
record from stdin and outputs modified records to stdout.
Note: This is a "best effort" approach and can potentially mangle the author name.
"""

import argparse
import re
from typing import Generator, List, Optional

from augur.io.print import print_err
from augur.utils import first_line


def parse_authors(
    record: dict,
    authors_field: str,
    default_value: str,
    index: int,
    abbr_authors_field: Optional[str] = None,
) -> dict:
    """
    Abbreviate the author list of *record* to '<first author> et al.'.

    The record is modified in place and also returned.

    Parameters
    ----------
    record
        Record holding the author list; must contain *authors_field*.
    authors_field
        Name of the field containing the full list of authors.
    default_value
        Value used when no first author can be extracted (the list is
        empty, only whitespace, or starts with a separator).
    index
        Position of the record in the input; only used in the warning
        message.
    abbr_authors_field
        Field that receives the abbreviated authors. When not provided
        (or empty), *authors_field* itself is overwritten.

    Returns
    -------
    dict
        The same *record* object, updated with the abbreviated authors.

    Raises
    ------
    KeyError
        If *authors_field* is not present in *record*.
    """
    # Collapse every run of whitespace into a single space
    normalized = re.sub(r"\s+", " ", record[authors_field])

    # Split authors list on comma/semicolon (ASCII or full-width,
    # the latter written as escapes so they survive re-encoding)
    # OR "and"/"&" with at least one space before and after.
    # Only the first entry is kept; stripping it afterwards (rather than
    # stripping the whole list first) keeps a trailing " and " working
    # as a separator.
    first_author = re.split(
        r"(?:\s*[,;\uFF0C\uFF1B]\s*|\s+(?:and|&)\s+)", normalized
    )[0].strip()

    if first_author == "":
        # Nothing usable in front of the first separator
        new_authors = default_value
    elif first_author.strip(". ").endswith(" et al"):
        # Already abbreviated, leave it alone
        new_authors = first_author
    else:
        new_authors = first_author + " et al."

    if abbr_authors_field:
        if record.get(abbr_authors_field):
            print_err(
                f"WARNING: the {abbr_authors_field!r} field already exists",
                f"in record {index} and will be overwritten!",
            )

        record[abbr_authors_field] = new_authors
    else:
        record[authors_field] = new_authors

    return record


def register_parser(
    parent_subparsers: argparse._SubParsersAction,
) -> argparse.ArgumentParser:
    """
    Register the ``abbreviate-authors`` sub-command and its options.

    Parameters
    ----------
    parent_subparsers
        Sub-parsers action to add the command to. It must carry a
        ``shared_parser`` attribute, which is used as parent parser.

    Returns
    -------
    argparse.ArgumentParser
        The parser created for the ``abbreviate-authors`` sub-command.
    """
    parser = parent_subparsers.add_parser(
        "abbreviate-authors",
        parents=[parent_subparsers.shared_parser],  # type: ignore
        # The first line of the module docstring is the short help text
        help=first_line(__doc__),
    )

    parser.add_argument(
        "--authors-field",
        default="authors",
        help="The field containing list of authors.",
    )
    parser.add_argument(
        "--default-value",
        default="?",
        help="Default value to use if authors list is empty.",
    )
    parser.add_argument(
        "--abbr-authors-field",
        help="The field for the generated abbreviated authors. "
        + "If not provided, the original authors field will be modified.",
    )

    return parser


def run(args: argparse.Namespace, records: List[dict]) -> Generator[dict, None, None]:
    """
    Abbreviate the authors of each record and yield the records in order.

    The field names and the default value are read from *args*
    (``authors_field``, ``default_value``, ``abbr_authors_field``).
    Records are updated in place by ``parse_authors``, which hands back
    the very same record object that is yielded here.
    """
    for position, entry in enumerate(records):
        yield parse_authors(
            record=entry,
            authors_field=args.authors_field,
            default_value=args.default_value,
            index=position,
            abbr_authors_field=args.abbr_authors_field,
        )
71 changes: 71 additions & 0 deletions tests/functional/curate/cram/abbreviate-authors/default-behavior.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Running the command with no arguments produces the expected output

$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}

`--authors-field` can be used to set an alternative field name

$ echo '{"author-list":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors \
> --authors-field="author-list"
{"author-list": "Troesemeier et al."}

`--default-value` can be used to provide a default for an empty field

$ echo '{"authors":""}' \
> | ${AUGUR} curate abbreviate-authors \
> --default="??"
{"authors": "??"}

`--abbr-authors-field` can be used to put the abbreviated authors into a different field

$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors \
> --abbr-authors-field="abbr-authors"
{"authors": "Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors": "Troesemeier et al."}

`--authors-field` and `--default-value` work together

$ echo '{"author-list":""}' \
> | ${AUGUR} curate abbreviate-authors \
> --authors-field="author-list" \
> --default-value="???"
{"author-list": "???"}

`--authors-field` and `--abbr-authors-field` work together

$ echo '{"author-list":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors \
> --authors-field="author-list" \
> --abbr-authors-field="abbr-authors"
{"author-list": "Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors": "Troesemeier et al."}

`--default-value` and `--abbr-authors-field` work together

$ echo '{"authors":""}' \
> | ${AUGUR} curate abbreviate-authors \
> --default-value="?" \
> --abbr-authors-field="abbr-authors"
{"authors": "", "abbr-authors": "?"}

All three options work together

$ echo '{"author-list":""}' \
> | ${AUGUR} curate abbreviate-authors \
> --authors-field="author-list" \
> --abbr-authors-field="abbr-authors" \
> --default-value="?!"
{"author-list": "", "abbr-authors": "?!"}

Running the command with no arguments and multiple records produces the expected output

$ echo '{"authors":"Troesemeier,J.-H. & Musso,D."}
> {"authors":"Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}
{"authors": "Bluemel et al."}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Semi-colon separator is supported

$ echo '{"authors":"Troesemeier,J.-H.; Musso,D.; Bluemel,J.; and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}

Ampersand separator is supported

$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. & Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}

Semi-colons and ampersand separators together are supported

$ echo '{"authors":"Troesemeier,J.-H.; Musso,D.; Bluemel,J. & Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Whitespace in author list gets stripped out

$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \
> | ${AUGUR} curate abbreviate-authors
{"authors": "Troesemeier et al."}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

Overwriting an existing `--abbr-authors-field` value generates a warning

When the field named by `--abbr-authors-field` already holds a value, a warning is printed and the value is replaced

$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors":"I EXIST"}' \
> | ${AUGUR} curate abbreviate-authors \
> --abbr-authors-field="abbr-authors"
WARNING: the 'abbr-authors' field already exists in record 0 and will be overwritten!
{"authors": "Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors": "Troesemeier et al."}

0 comments on commit 748a588

Please sign in to comment.