Merge pull request #20: Add transform strain names script

nextstrain · Sep 18, 2023 · c02fa81 · c02fa81
2 parents f9973e7 + 6f196f7
commit c02fa81
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 0 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -13,3 +13,11 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: nextstrain/.github/actions/shellcheck@master
+
+  cram:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+      - run: pip install cram
+      - run: cram tests/
diff --git a/README.md b/README.md
@@ -114,6 +114,7 @@ Potential augur curate scripts
 - [transform-authors](transform-authors) - Abbreviates full author lists to '<first author> et al.'
 - [transform-field-names](transform-field-names) - Rename fields of NDJSON records
 - [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"<country_value>[:<region>][, <locality>]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/)
+- [transform-strain-names](transform-strain-names) - Ordered search for strain names across several fields.
 
 ## Software requirements
 

diff --git a/tests/transform-strain-names/transform-strain-names.t b/tests/transform-strain-names/transform-strain-names.t
@@ -0,0 +1,17 @@
+Look for strain name in "strain" or a list of backup fields.
+
+If strain entry exists, do not do anything.
+
+  $ echo '{"strain": "i/am/a/strain", "strain_s": "other"}' \
+  >   | $TESTDIR/../../transform-strain-names \
+  >       --strain-regex '^.+$' \
+  >       --backup-fields strain_s accession
+  {"strain":"i/am/a/strain","strain_s":"other"}
+
+If strain entry does not exist, search the backup fields.
+
+  $ echo '{"strain_s": "other"}' \
+  >   | $TESTDIR/../../transform-strain-names \
+  >       --strain-regex '^.+$' \
+  >       --backup-fields accession strain_s 
+  {"strain_s":"other","strain":"other"}
diff --git a/transform-strain-names b/transform-strain-names
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+"""
+Verifies strain name pattern in the 'strain' field of the NDJSON record from
+stdin. Adds a 'strain' field to the record if it does not already exist.
+
+Outputs the modified records to stdout.
+"""
+import argparse
+import json
+import re
+from sys import stderr, stdin, stdout
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("--strain-regex", default="^.+$",
+        help="Regex pattern for strain names. " +
+             "Strain names that do not match the pattern will be dropped.")
+    parser.add_argument("--backup-fields", nargs="*",
+        help="List of backup fields to use as strain name if the value in 'strain' " +
+             "does not match the strain regex pattern. " +
+             "If multiple fields are provided, will use the first field that has a non-empty string.")
+
+    args = parser.parse_args()
+
+    strain_name_pattern = re.compile(args.strain_regex)
+
+    for index, record in enumerate(stdin):
+        record = json.loads(record)
+
+        # Verify strain name matches the strain regex pattern
+        if strain_name_pattern.match(record.get('strain', '')) is None:
+            # Default to empty string if not matching pattern
+            record['strain'] = ''
+            # Use non-empty value of backup fields if provided
+            if args.backup_fields:
+                for field in args.backup_fields:
+                    if record.get(field):
+                        record['strain'] = str(record[field])
+                        break
+
+        if record['strain'] == '':
+            print(f"WARNING: Record number {index} has an empty string as the strain name.", file=stderr)
+
+
+        json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
+        print()