nextstrain · joverlee521 · Jun 28, 2024 · Jun 27, 2024 · Jun 27, 2024 · Jun 27, 2024
diff --git a/CHANGES.md b/CHANGES.md
@@ -4,8 +4,9 @@
 
 ### Features
 
-* Added a new sub-command `augur curate apply-geolocation-rules` to apply user curated geolocation rules to the geolocation fields in a metadata file. Previously, this was available as a script script within the nextstrain/ingest repo. [#1491][] (@victorlin)
+* Added a new sub-command `augur curate apply-geolocation-rules` to apply user curated geolocation rules to the geolocation fields in a metadata file. Previously, this was available as a script within the nextstrain/ingest repo. [#1491][] (@victorlin)
 * Added a default color for the "Asia" region that will be used in `augur export` if no custom colors are provided. [#1490][] (@joverlee521)
+* Added a new sub-command `augur curate apply-record-annotations` to apply user curated annotations to existing fields in a metadata file. Previously, this was available as the `merge-user-metadata` script in the nextstrain/ingest repo. [#1495][] (@joverlee521)
 
 ### Bug Fixes
 
@@ -16,6 +17,7 @@
 [#1490]: https://github.com/nextstrain/augur/pull/1490
 [#1491]: https://github.com/nextstrain/augur/pull/1491
 [#1493]: https://github.com/nextstrain/augur/pull/1493
+[#1495]: https://github.com/nextstrain/augur/pull/1495
 
 ## 24.4.0 (15 May 2024)
 

diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py
@@ -12,7 +12,7 @@
 from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
 from augur.io.sequences import write_records_to_fasta
 from augur.types import DataErrorMethod
-from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules
+from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations
 
 
 SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
@@ -22,6 +22,7 @@
     format_dates,
     titlecase,
     apply_geolocation_rules,
+    apply_record_annotations,
 ]
 
 

diff --git a/augur/curate/apply_record_annotations.py b/augur/curate/apply_record_annotations.py
@@ -0,0 +1,56 @@
+"""
+Applies record annotations to overwrite field values.
+This does not do any additional transformations on top of the annotations.
+"""
+import csv
+from collections import defaultdict
+from augur.errors import AugurError
+from augur.io.print import print_err
+from augur.utils import first_line
+
+
+def register_parser(parent_subparsers):
+    parser = parent_subparsers.add_parser("apply-record-annotations",
+        parents=[parent_subparsers.shared_parser],
+        help=first_line(__doc__))
+
+    parser.add_argument("--annotations", metavar="TSV", required=True,
+        help="Manually curated annotations TSV file. " +
+             "The TSV should not have a header and should have exactly three columns: " +
+             "id to match existing metadata, field name, and field value. " +
+             "If there are multiple annotations for the same id and field, then the last value is used. " +
+             "Lines starting with '#' are treated as comments. " +
+             "Any '#' after the field value are treated as comments.")
+    parser.add_argument("--id-field", default="accession",
+        help="The ID field in the metadata to use to merge with the annotations.")
+
+    return parser
+
+
+def run(args, records):
+    annotations = defaultdict(dict)
+    with open(args.annotations, 'r') as annotations_fh:
+        csv_reader = csv.reader(annotations_fh, delimiter='\t')
+        for row in csv_reader:
+            if not row or row[0].lstrip()[0] == '#':
+                    continue
+            elif len(row) != 3:
+                print_err("WARNING: Could not decode annotation line " + "\t".join(row))
+                continue
+            id, field, value = row
+            annotations[id][field] = value.partition('#')[0].rstrip()
+
+    for record in records:
+        record_id = record.get(args.id_field)
+        if record_id is None:
+            raise AugurError(f"ID field {args.id_field!r} does not exist in record")
+
+        record_annotations = annotations.get(record_id, {})
+        for field in list(record_annotations.keys()):
+            if field not in record:
+                print_err(f"WARNING: Skipping annotation for field {field!r} that does not exist in record")
+                del record_annotations[field]
+
+        record.update(record_annotations)
+
+        yield record
diff --git a/augur/curate/titlecase.py b/augur/curate/titlecase.py
diff --git a/tests/functional/curate/cram/apply-record-annotations/annotating-new-fields.t b/tests/functional/curate/cram/apply-record-annotations/annotating-new-fields.t
@@ -0,0 +1,21 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"
+
+Test that annotations for new fields that are not in the record result in a warning message.
+
+  $ cat >annotations.tsv <<~~
+  > record_2	new_field	annotation_1
+  > ~~
+
+  $ cat >records.ndjson <<~~
+  > {"accession": "record_1", "field_1": "value_1"}
+  > {"accession": "record_2", "field_1": "value_1"}
+  > ~~
+
+  $ cat records.ndjson \
+  >   |  ${AUGUR} curate apply-record-annotations \
+  >       --annotations annotations.tsv
+  WARNING: Skipping annotation for field 'new_field' that does not exist in record
+  {"accession": "record_1", "field_1": "value_1"}
+  {"accession": "record_2", "field_1": "value_1"}
diff --git a/tests/functional/curate/cram/apply-record-annotations/custom-id-field.t b/tests/functional/curate/cram/apply-record-annotations/custom-id-field.t
@@ -0,0 +1,15 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"
+
+Test annotations that use a custom id field.
+
+  $ cat >annotations.tsv <<~~
+  > record_1	field_1	annotation_1
+  > ~~
+
+  $ echo '{"record_id": "record_1", "field_1": "value_1"}' \
+  >   |  ${AUGUR} curate apply-record-annotations \
+  >       --id-field record_id \
+  >       --annotations annotations.tsv
+  {"record_id": "record_1", "field_1": "annotation_1"}
diff --git a/tests/functional/curate/cram/apply-record-annotations/ignore-comments.t b/tests/functional/curate/cram/apply-record-annotations/ignore-comments.t
@@ -0,0 +1,15 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"
+
+Test that comments in the annotations file are ignored.
+
+  $ cat >annotations.tsv <<~~
+  > # This is a comment.
+  > record_1	field_1	annotation_1 # This is also a comment.
+  > ~~
+
+  $ echo '{"accession": "record_1", "field_1": "value_1"}' \
+  >   |  ${AUGUR} curate apply-record-annotations \
+  >       --annotations annotations.tsv
+  {"accession": "record_1", "field_1": "annotation_1"}
diff --git a/tests/functional/curate/cram/apply-record-annotations/invalid-annotation-warnings.t b/tests/functional/curate/cram/apply-record-annotations/invalid-annotation-warnings.t
@@ -0,0 +1,21 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"
+
+Using invalid annotations results in warning messages.
+Valid annotations are still applied.
+
+  $ cat >annotations.tsv <<~~
+  > record_1	field_1	annotation_1	extra_field
+  > record_1	field_1	annotation_1
+  > record_1	field_1
+  > record_1
+  > ~~
+
+  $ echo '{"accession": "record_1", "field_1": "value_1"}' \
+  >   |  ${AUGUR} curate apply-record-annotations \
+  >       --annotations annotations.tsv
+  WARNING: Could not decode annotation line record_1\tfield_1\tannotation_1\textra_field (esc)
+  WARNING: Could not decode annotation line record_1\tfield_1 (esc)
+  WARNING: Could not decode annotation line record_1
+  {"accession": "record_1", "field_1": "annotation_1"}
diff --git a/tests/functional/curate/cram/apply-record-annotations/invalid-id-field-error.t b/tests/functional/curate/cram/apply-record-annotations/invalid-id-field-error.t
@@ -0,0 +1,16 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"
+
+Using a custom id field that does not exist in the record results in an error.
+
+  $ cat >annotations.tsv <<~~
+  > record_1	field_1	annotation_1
+  > ~~
+
+  $ echo '{"record_id": "record_1", "field_1": "value_1"}' \
+  >   |  ${AUGUR} curate apply-record-annotations \
+  >       --id-field bad_id \
+  >       --annotations annotations.tsv
+  ERROR: ID field 'bad_id' does not exist in record
+  [2]
diff --git a/tests/functional/curate/cram/apply-record-annotations/overwrite-existing-fields.t b/tests/functional/curate/cram/apply-record-annotations/overwrite-existing-fields.t
@@ -0,0 +1,15 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"
+
+Test that annotations overwrite existing fields.
+
+  $ cat >annotations.tsv <<~~
+  > record_1	field_1	annotation_1
+  > record_1	field_2	annotation_2
+  > ~~
+
+  $ echo '{"accession": "record_1", "field_1": "value_1", "field_2": "value_2"}' \
+  >   |  ${AUGUR} curate apply-record-annotations \
+  >       --annotations annotations.tsv
+  {"accession": "record_1", "field_1": "annotation_1", "field_2": "annotation_2"}