-
Notifications
You must be signed in to change notification settings - Fork 129
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Convert script over to expected sub-command style * Make code match comments (comments say "et al." in multiple places; code was adding "et al") * Add type hints throughout * Add tests
- Loading branch information
Showing
7 changed files
with
213 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
""" | ||
Abbreviates a full list of authors to be '<first author> et al.' of the NDJSON | ||
record from stdin and outputs modified records to stdout. | ||
Note: This is a "best effort" approach and can potentially mangle the author name. | ||
""" | ||
|
||
import argparse
import re
from typing import Generator, List, Optional

from augur.io.print import print_err
from augur.utils import first_line
|
||
|
||
def parse_authors(
    record: dict,
    authors_field: str,
    default_value: str,
    index: int,
    abbr_authors_field: Optional[str] = None,
) -> dict:
    """Abbreviate a record's authors list to ``<first author> et al.``.

    This is a best-effort text transformation: the authors string is split on
    common separators and only the first non-empty piece is kept.

    Parameters
    ----------
    record
        The record to modify. It is mutated in place and also returned.
    authors_field
        Name of the field holding the full authors list. The field must be
        present and hold a string.
    default_value
        Value to use when the authors list is empty, whitespace-only, or
        consists solely of separators.
    index
        Position of the record in the input; only used in the warning message.
    abbr_authors_field
        Name of the field that receives the abbreviated authors. When falsy,
        ``authors_field`` itself is overwritten.

    Returns
    -------
    dict
        The same ``record`` object, with the abbreviated authors written to
        ``abbr_authors_field`` if given, otherwise to ``authors_field``.

    Raises
    ------
    KeyError
        If ``authors_field`` is not a key of ``record``.
    """
    # Collapse every run of whitespace to a single space AND trim both ends,
    # so that whitespace-only input is treated as empty and leading blanks
    # never leak into the abbreviated name.
    normalized = re.sub(r"\s+", " ", record[authors_field]).strip()

    # Split authors list on comma/semicolon (ASCII or full-width forms)
    # OR "and"/"&" with at least one space before and after.
    pieces = re.split(
        r"(?:\s*[,;\uFF0C\uFF1B]\s*|\s+(?:and|&)\s+)",
        normalized,
    )

    # Take the first non-empty piece: a leading separator (", Smith") would
    # otherwise produce an empty first author and a bare " et al." result.
    first_author = next((piece for piece in pieces if piece), "")

    if first_author == "":
        new_authors = default_value
    elif first_author.strip(". ").endswith(" et al"):
        # Already abbreviated; do not add a second " et al."
        new_authors = first_author
    else:
        new_authors = first_author + " et al."

    if abbr_authors_field:
        # Only warn when the target field holds a truthy value that is about
        # to be replaced.
        if record.get(abbr_authors_field):
            print_err(
                f"WARNING: the {abbr_authors_field!r} field already exists",
                f"in record {index} and will be overwritten!",
            )

        record[abbr_authors_field] = new_authors
    else:
        record[authors_field] = new_authors

    return record
|
||
|
||
def register_parser(
    parent_subparsers: argparse._SubParsersAction,
) -> argparse.ArgumentParser:
    """Register the ``abbreviate-authors`` sub-command and its options.

    Parameters
    ----------
    parent_subparsers
        The sub-parsers action of the parent command. It must also expose a
        ``shared_parser`` attribute, which is passed as a parent parser so the
        new sub-command inherits its arguments.

    Returns
    -------
    argparse.ArgumentParser
        The parser created for the ``abbreviate-authors`` sub-command.
    """
    parser = parent_subparsers.add_parser(
        "abbreviate-authors",
        # `shared_parser` is not a standard argparse attribute, hence the
        # type-checker suppression.
        parents=[parent_subparsers.shared_parser],  # type: ignore
        # Help text is derived from this module's docstring.
        help=first_line(__doc__),
    )

    parser.add_argument(
        "--authors-field",
        default="authors",
        help="The field containing list of authors.",
    )
    parser.add_argument(
        "--default-value",
        default="?",
        help="Default value to use if authors list is empty.",
    )
    # No default here: when omitted the value is None.
    parser.add_argument(
        "--abbr-authors-field",
        help="The field for the generated abbreviated authors. "
        + "If not provided, the original authors field will be modified.",
    )

    return parser
|
||
|
||
def run(args: argparse.Namespace, records: List[dict]) -> Generator[dict, None, None]:
    """Abbreviate the authors of each record and yield the records in order.

    Parameters
    ----------
    args
        Parsed command-line arguments; ``authors_field``, ``default_value``
        and ``abbr_authors_field`` are read from it for every record.
    records
        The records to process.

    Yields
    ------
    dict
        Each input record, after ``parse_authors`` has been applied to it.
    """
    for position, entry in enumerate(records):
        # The return value is not needed; the yielded object is the entry
        # that was handed to parse_authors.
        parse_authors(
            record=entry,
            authors_field=args.authors_field,
            default_value=args.default_value,
            index=position,
            abbr_authors_field=args.abbr_authors_field,
        )

        yield entry
71 changes: 71 additions & 0 deletions
71
tests/functional/curate/cram/abbreviate-authors/default-behavior.t
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
Running the command with no arguments produces the expected output | ||
|
||
$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \ | ||
> | ${AUGUR} curate abbreviate-authors | ||
{"authors": "Troesemeier et al."} | ||
|
||
`--authors-field` can be used to set an alternative field name | ||
|
||
$ echo '{"author-list":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \ | ||
> | ${AUGUR} curate abbreviate-authors \ | ||
> --authors-field="author-list" | ||
{"author-list": "Troesemeier et al."} | ||
|
||
`--default-value` can be used to provide a default for an empty field | ||
|
||
$ echo '{"authors":""}' \ | ||
> | ${AUGUR} curate abbreviate-authors \ | ||
> --default="??" | ||
{"authors": "??"} | ||
|
||
`--abbr-authors-field` can be used to put the abbreviated authors into a different field | ||
|
||
$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \ | ||
> | ${AUGUR} curate abbreviate-authors \ | ||
> --abbr-authors-field="abbr-authors" | ||
{"authors": "Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors": "Troesemeier et al."} | ||
|
||
`--authors-field` and `--default-value` work together | ||
|
||
$ echo '{"author-list":""}' \ | ||
> | ${AUGUR} curate abbreviate-authors \ | ||
> --authors-field="author-list" \ | ||
> --default-value="???" | ||
{"author-list": "???"} | ||
|
||
`--authors-field` and `--abbr-authors-field` work together | ||
|
||
$ echo '{"author-list":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \ | ||
> | ${AUGUR} curate abbreviate-authors \ | ||
> --authors-field="author-list" \ | ||
> --abbr-authors-field="abbr-authors" | ||
{"author-list": "Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors": "Troesemeier et al."} | ||
|
||
`--default-value` and `--abbr-authors-field` work together | ||
|
||
$ echo '{"authors":""}' \ | ||
> | ${AUGUR} curate abbreviate-authors \ | ||
> --default-value="?" \ | ||
> --abbr-authors-field="abbr-authors" | ||
{"authors": "", "abbr-authors": "?"} | ||
|
||
All three options work together | ||
|
||
$ echo '{"author-list":""}' \ | ||
> | ${AUGUR} curate abbreviate-authors \ | ||
> --authors-field="author-list" \ | ||
> --abbr-authors-field="abbr-authors" \ | ||
> --default-value="?!" | ||
{"author-list": "", "abbr-authors": "?!"} | ||
|
||
Running the command with no arguments and multiple records produces the expected output | ||
|
||
$ echo '{"authors":"Troesemeier,J.-H. & Musso,D."} | ||
> {"authors":"Bluemel,J. and Baylis,S.A."}' \ | ||
> | ${AUGUR} curate abbreviate-authors | ||
{"authors": "Troesemeier et al."} | ||
{"authors": "Bluemel et al."} |
9 changes: 9 additions & 0 deletions
9
tests/functional/curate/cram/abbreviate-authors/only-one-et-al.t
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
If the authors list already ends in `et al.` don't add another. | ||
|
||
$ echo '{"authors":"Troesemeier et al."}' \ | ||
> | ${AUGUR} curate abbreviate-authors | ||
{"authors": "Troesemeier et al."} |
21 changes: 21 additions & 0 deletions
21
tests/functional/curate/cram/abbreviate-authors/separator-support.t
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
Semi-colon separator is supported | ||
|
||
$ echo '{"authors":"Troesemeier,J.-H.; Musso,D.; Bluemel,J.; and Baylis,S.A."}' \ | ||
> | ${AUGUR} curate abbreviate-authors | ||
{"authors": "Troesemeier et al."} | ||
|
||
Ampersand separator is supported | ||
|
||
$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. & Baylis,S.A."}' \ | ||
> | ${AUGUR} curate abbreviate-authors | ||
{"authors": "Troesemeier et al."} | ||
|
||
Semi-colons and ampersand separators together are supported | ||
|
||
$ echo '{"authors":"Troesemeier,J.-H.; Musso,D.; Bluemel,J. & Baylis,S.A."}' \ | ||
> | ${AUGUR} curate abbreviate-authors | ||
{"authors": "Troesemeier et al."} |
9 changes: 9 additions & 0 deletions
9
tests/functional/curate/cram/abbreviate-authors/strip-whitespace-in-authors.t
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
Whitespace in author list gets stripped out | ||
|
||
$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A."}' \ | ||
> | ${AUGUR} curate abbreviate-authors | ||
{"authors": "Troesemeier et al."} |
13 changes: 13 additions & 0 deletions
13
tests/functional/curate/cram/abbreviate-authors/warn-about-overwrite.t
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
Overwriting an existing `--abbr-authors-field` value generates a warning | ||
|
||
If the field named by `--abbr-authors-field` already has a value in the record, a warning is printed and the value is replaced | ||
|
||
$ echo '{"authors":"Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors":"I EXIST"}' \ | ||
> | ${AUGUR} curate abbreviate-authors \ | ||
> --abbr-authors-field="abbr-authors" | ||
WARNING: the 'abbr-authors' field already exists in record 0 and will be overwritten! | ||
{"authors": "Troesemeier,J.-H., Musso,D., Bluemel,J. and Baylis,S.A.", "abbr-authors": "Troesemeier et al."} |