From d5358bce7891e15acacb410cdf04e73db83285b3 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 17 Jul 2024 10:03:35 -0400 Subject: [PATCH 1/7] Replace vendored scripts with augur curate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vendored scripts for the curate rule have been ported to `augur curate` and are available starting from Augur 25.0.0.¹ ¹ --- ingest/config/config.yaml | 2 ++ ingest/workflow/snakemake_rules/transform.smk | 14 ++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index f7f820e..3cce905 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -24,6 +24,8 @@ transform: # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ'] + # The expected field that contains the GenBank geo_loc_name + genbank_location_field: location # Titlecase rules titlecase: # Abbreviations not cast to titlecase, keeps uppercase diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk index 1dd82fc..849b8dc 100644 --- a/ingest/workflow/snakemake_rules/transform.smk +++ b/ingest/workflow/snakemake_rules/transform.smk @@ -52,6 +52,7 @@ rule transform: strain_backup_fields = config['transform']['strain_backup_fields'], date_fields = config['transform']['date_fields'], expected_date_formats = config['transform']['expected_date_formats'], + genbank_location_field=config["transform"]["genbank_location_field"], articles = config['transform']['titlecase']['articles'], abbreviations = config['transform']['titlecase']['abbreviations'], titlecase_fields = config['transform']['titlecase']['fields'], @@ -65,27 +66,28 @@ rule transform: shell: """ (cat {input.sequences_ndjson} \ - | ./vendored/transform-field-names \ + | augur curate rename \ --field-map {params.field_map} \ | augur curate normalize-strings \ - | ./vendored/transform-strain-names \ + | augur curate transform-strain-name \ --strain-regex {params.strain_regex} \ --backup-fields {params.strain_backup_fields} \ | augur curate format-dates \ --date-fields {params.date_fields} \ --expected-date-formats {params.expected_date_formats} \ - | ./vendored/transform-genbank-location \ + | augur curate parse-genbank-location \ + --location-field {params.genbank_location_field} \ | augur curate titlecase \ --titlecase-fields {params.titlecase_fields} \ --articles {params.articles} \ --abbreviations {params.abbreviations} \ - | ./vendored/transform-authors \ + | augur curate abbreviate-authors \ --authors-field {params.authors_field} \ --default-value {params.authors_default_value} \ --abbr-authors-field {params.abbr_authors_field} \ - | ./vendored/apply-geolocation-rules \ + | augur curate apply-geolocation-rules \ --geolocation-rules {input.all_geolocation_rules} \ - | ./vendored/merge-user-metadata \ + | augur curate apply-record-annotations \ --annotations {input.annotations} \ --id-field {params.annotations_id} \ | ./bin/ndjson-to-tsv-and-fasta \ From 4b0eab1ebc18f1500a6c9e0af92756acba7cdf3a Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 17 Jul 2024 11:04:47 -0400 Subject: [PATCH 2/7] Add subset_metadata Separate out selecting metadata columns into a rule to match the pathogen repo guide --- ingest/bin/ndjson-to-tsv-and-fasta | 66 ------------------- 
ingest/workflow/snakemake_rules/sort.smk | 2 +- ingest/workflow/snakemake_rules/transform.smk | 26 +++++--- 3 files changed, 19 insertions(+), 75 deletions(-) delete mode 100755 ingest/bin/ndjson-to-tsv-and-fasta diff --git a/ingest/bin/ndjson-to-tsv-and-fasta b/ingest/bin/ndjson-to-tsv-and-fasta deleted file mode 100755 index 017bcc0..0000000 --- a/ingest/bin/ndjson-to-tsv-and-fasta +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 -""" -Parses NDJSON records from stdin to two different files: a metadata TSV and a -sequences FASTA. - -Records that do not have an ID or sequence will be excluded from the output files. -""" -import argparse -import csv -import json -from sys import stderr, stdin - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--metadata", metavar="TSV", default="data/metadata.tsv", - help="The output metadata TSV file") - parser.add_argument("--fasta", metavar="FASTA", default="data/sequences.fasta", - help="The output sequences FASTA file") - parser.add_argument("--metadata-columns", nargs="+", - help="List of fields from the NDJSON records to include as columns in the metadata TSV. " + - "Metadata TSV columns will be in the order of the columns provided.") - parser.add_argument("--id-field", default='strain', - help="Field from the records to use as the sequence ID in the FASTA file.") - parser.add_argument("--sequence-field", default='sequence', - help="Field from the record that holds the genomic sequence for the FASTA file.") - - args = parser.parse_args() - - with open(args.metadata, 'wt') as metadata_output: - with open(args.fasta, 'wt') as fasta_output: - metadata_csv = csv.DictWriter( - metadata_output, - args.metadata_columns, - restval="", - extrasaction='ignore', - delimiter='\t' - ) - metadata_csv.writeheader() - - for index, record in enumerate(stdin): - record = json.loads(record) - - sequence_id = str(record.get(args.id_field, '')) - sequence = str(record.get(args.sequence_field, '')) - - if not sequence_id: - print( - f"WARNING: Record number {index} does not have a sequence ID.", - "This record will be excluded from the output files.", - file=stderr - ) - elif not sequence: - print( - f"WARNING: Record number {index} does not have a sequence.", - "This record will be excluded from the output files.", - file=stderr - ) - else: - metadata_csv.writerow(record) - - print(f">{sequence_id}", file=fasta_output) - print(f"{sequence}" , file= fasta_output) diff --git a/ingest/workflow/snakemake_rules/sort.smk b/ingest/workflow/snakemake_rules/sort.smk index fda5046..e5a0eb1 100644 --- a/ingest/workflow/snakemake_rules/sort.smk +++ b/ingest/workflow/snakemake_rules/sort.smk @@ -27,7 +27,7 @@ rule sort: rule metadata: input: - metadata = rules.transform.output.metadata, + metadata = rules.subset_metadata.output.subset_metadata, sequences = "data/{type}/sequences.fasta" output: metadata = "data/{type}/metadata_raw.tsv" diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk index 849b8dc..c48f66a 100644 --- a/ingest/workflow/snakemake_rules/transform.smk +++ b/ingest/workflow/snakemake_rules/transform.smk @@ -42,7 +42,7 @@ rule transform: all_geolocation_rules = "data/all-geolocation-rules.tsv", annotations = config['transform']['annotations'], output: - metadata = "data/metadata.tsv", + metadata = "data/curated_metadata.tsv", sequences = "data/sequences.fasta" log: "logs/transform.txt" @@ 
-60,7 +60,6 @@ rule transform: authors_default_value = config['transform']['authors_default_value'], abbr_authors_field = config['transform']['abbr_authors_field'], annotations_id = config['transform']['annotations_id'], - metadata_columns = config['transform']['metadata_columns'], id_field = config['transform']['id_field'], sequence_field = config['transform']['sequence_field'] shell: @@ -90,10 +89,21 @@ rule transform: | augur curate apply-record-annotations \ --annotations {input.annotations} \ --id-field {params.annotations_id} \ - | ./bin/ndjson-to-tsv-and-fasta \ - --fasta {output.sequences} \ - --metadata-columns {params.metadata_columns} \ - --metadata {output.metadata} \ - --id-field {params.id_field} \ - --sequence-field {params.sequence_field} ) 2>> {log} + --output-fasta {output.sequences} \ + --output-metadata {output.metadata} \ + --output-id-field {params.id_field} \ + --output-seq-field {params.sequence_field} ) 2>> {log} """ + +rule subset_metadata: + input: + metadata = "data/curated_metadata.tsv", + output: + subset_metadata="data/metadata.tsv", + params: + metadata_fields=",".join(config["transform"]["metadata_columns"]), + shell: + """ + tsv-select -H -f {params.metadata_fields} \ + {input.metadata} > {output.subset_metadata} + """ \ No newline at end of file From 33162d9fbf93ee545e386d5b7aaa5e3f11a4fba0 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 17 Jul 2024 11:06:04 -0400 Subject: [PATCH 3/7] Drop unused and blank metadata columns --- ingest/config/config.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index 3cce905..8e5812d 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -70,7 +70,5 @@ transform: 'date_submitted', 'sra_accession', 'abbr_authors', - 'reverse', 'authors', - 'institution' ] From 4d7a99f642c4a4317fdae11e9da97f817b3e2e20 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 17 Jul 2024 11:34:46 -0400 Subject: [PATCH 4/7] Rename `transform` to `curate` To conform to the pathogen-repo-guide https://github.com/nextstrain/pathogen-repo-guide/blob/89b3c5dbc9b8f6a009f4a19c3ac56113bc5511ee/ingest/rules/curate.smk#L57 --- ingest/Snakefile | 2 +- ingest/config/config.yaml | 2 +- .../{transform.smk => curate.smk} | 42 +++++++++---------- ingest/workflow/snakemake_rules/sort.smk | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) rename ingest/workflow/snakemake_rules/{transform.smk => curate.smk} (69%) diff --git a/ingest/Snakefile b/ingest/Snakefile index 7589a39..9902536 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -45,7 +45,7 @@ rule all: include: "workflow/snakemake_rules/fetch_sequences.smk" -include: "workflow/snakemake_rules/transform.smk" +include: "workflow/snakemake_rules/curate.smk" if "s3_dst" in config: include: "workflow/snakemake_rules/upload.smk" diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index 8e5812d..bfdb010 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -8,7 +8,7 @@ ncbi_taxon_id: general: "11250" # Params for the transform rulegeneral -transform: +curate: # Fields to rename. 
# This is the first step in the pipeline, so any references to field names # in the configs below should use the new field names diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/curate.smk similarity index 69% rename from ingest/workflow/snakemake_rules/transform.smk rename to ingest/workflow/snakemake_rules/curate.smk index c48f66a..b8de48e 100644 --- a/ingest/workflow/snakemake_rules/transform.smk +++ b/ingest/workflow/snakemake_rules/curate.smk @@ -16,7 +16,7 @@ rule fetch_general_geolocation_rules: output: general_geolocation_rules = "data/general-geolocation-rules.tsv" params: - geolocation_rules_url = config['transform']['geolocation_rules_url'] + geolocation_rules_url = config['curate']['geolocation_rules_url'] shell: """ curl {params.geolocation_rules_url} > {output.general_geolocation_rules} @@ -25,7 +25,7 @@ rule fetch_general_geolocation_rules: rule concat_geolocation_rules: input: general_geolocation_rules = "data/general-geolocation-rules.tsv", - local_geolocation_rules = config['transform']['local_geolocation_rules'] + local_geolocation_rules = config['curate']['local_geolocation_rules'] output: all_geolocation_rules = "data/all-geolocation-rules.tsv" shell: @@ -36,32 +36,32 @@ rule concat_geolocation_rules: -rule transform: +rule curate: input: sequences_ndjson = "data/sequences.ndjson", all_geolocation_rules = "data/all-geolocation-rules.tsv", - annotations = config['transform']['annotations'], + annotations = config['curate']['annotations'], output: metadata = "data/curated_metadata.tsv", sequences = "data/sequences.fasta" log: - "logs/transform.txt" + "logs/curate.txt" params: - field_map = config['transform']['field_map'], - strain_regex = config['transform']['strain_regex'], - strain_backup_fields = config['transform']['strain_backup_fields'], - date_fields = config['transform']['date_fields'], - expected_date_formats = config['transform']['expected_date_formats'], - genbank_location_field=config["transform"]["genbank_location_field"], - articles = config['transform']['titlecase']['articles'], - abbreviations = config['transform']['titlecase']['abbreviations'], - titlecase_fields = config['transform']['titlecase']['fields'], - authors_field = config['transform']['authors_field'], - authors_default_value = config['transform']['authors_default_value'], - abbr_authors_field = config['transform']['abbr_authors_field'], - annotations_id = config['transform']['annotations_id'], - id_field = config['transform']['id_field'], - sequence_field = config['transform']['sequence_field'] + field_map = config['curate']['field_map'], + strain_regex = config['curate']['strain_regex'], + strain_backup_fields = config['curate']['strain_backup_fields'], + date_fields = config['curate']['date_fields'], + expected_date_formats = config['curate']['expected_date_formats'], + genbank_location_field=config["curate"]["genbank_location_field"], + articles = config['curate']['titlecase']['articles'], + abbreviations = config['curate']['titlecase']['abbreviations'], + titlecase_fields = config['curate']['titlecase']['fields'], + authors_field = config['curate']['authors_field'], + authors_default_value = config['curate']['authors_default_value'], + abbr_authors_field = config['curate']['abbr_authors_field'], + annotations_id = config['curate']['annotations_id'], + id_field = config['curate']['id_field'], + sequence_field = config['curate']['sequence_field'] shell: """ (cat {input.sequences_ndjson} \ @@ -101,7 +101,7 @@ rule subset_metadata: output: 
subset_metadata="data/metadata.tsv", params: - metadata_fields=",".join(config["transform"]["metadata_columns"]), + metadata_fields=",".join(config["curate"]["metadata_columns"]), shell: """ tsv-select -H -f {params.metadata_fields} \ diff --git a/ingest/workflow/snakemake_rules/sort.smk b/ingest/workflow/snakemake_rules/sort.smk index e5a0eb1..9680c02 100644 --- a/ingest/workflow/snakemake_rules/sort.smk +++ b/ingest/workflow/snakemake_rules/sort.smk @@ -12,7 +12,7 @@ It produces output files as rule sort: input: - sequences = rules.transform.output.sequences + sequences = rules.curate.output.sequences output: "data/a/sequences.fasta", "data/b/sequences.fasta" From fa412c9d0fa842dd0ca91432c5b6c692cdde0d41 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 17 Jul 2024 11:36:51 -0400 Subject: [PATCH 5/7] git subrepo pull (merge) ingest/vendored subrepo: subdir: "ingest/vendored" merged: "258ab8c" upstream: origin: "https://github.com/nextstrain/ingest" branch: "main" commit: "258ab8c" git-subrepo: version: "0.4.6" origin: "https://github.com/ingydotnet/git-subrepo" commit: "110b9eb" --- ingest/vendored/.cramrc | 3 - ingest/vendored/.github/dependabot.yml | 17 ++ ingest/vendored/.github/workflows/ci.yaml | 10 +- .../.github/workflows/pre-commit.yaml | 14 ++ ingest/vendored/.gitrepo | 4 +- ingest/vendored/.pre-commit-config.yaml | 40 +++ ingest/vendored/README.md | 47 ++-- ingest/vendored/apply-geolocation-rules | 234 ------------------ ingest/vendored/merge-user-metadata | 55 ---- .../transform-strain-names.t | 17 -- ingest/vendored/transform-authors | 66 ----- ingest/vendored/transform-field-names | 48 ---- ingest/vendored/transform-genbank-location | 43 ---- ingest/vendored/transform-strain-names | 50 ---- 14 files changed, 103 insertions(+), 545 deletions(-) delete mode 100644 ingest/vendored/.cramrc create mode 100644 ingest/vendored/.github/dependabot.yml create mode 100644 ingest/vendored/.github/workflows/pre-commit.yaml create mode 100644 ingest/vendored/.pre-commit-config.yaml delete mode 100755 ingest/vendored/apply-geolocation-rules delete mode 100755 ingest/vendored/merge-user-metadata delete mode 100644 ingest/vendored/tests/transform-strain-names/transform-strain-names.t delete mode 100755 ingest/vendored/transform-authors delete mode 100755 ingest/vendored/transform-field-names delete mode 100755 ingest/vendored/transform-genbank-location delete mode 100755 ingest/vendored/transform-strain-names diff --git a/ingest/vendored/.cramrc b/ingest/vendored/.cramrc deleted file mode 100644 index 153d20f..0000000 --- a/ingest/vendored/.cramrc +++ /dev/null @@ -1,3 +0,0 @@ -[cram] -shell = /bin/bash -indent = 2 diff --git a/ingest/vendored/.github/dependabot.yml b/ingest/vendored/.github/dependabot.yml new file mode 100644 index 0000000..89bd084 --- /dev/null +++ b/ingest/vendored/.github/dependabot.yml @@ -0,0 +1,17 @@ +# Dependabot configuration file +# +# +# Each ecosystem is checked on a scheduled interval defined below. To trigger +# a check manually, go to +# +# https://github.com/nextstrain/ingest/network/updates +# +# and look for a "Check for updates" button. You may need to click around a +# bit first. 
+--- +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/ingest/vendored/.github/workflows/ci.yaml b/ingest/vendored/.github/workflows/ci.yaml index c6a218a..c716277 100644 --- a/ingest/vendored/.github/workflows/ci.yaml +++ b/ingest/vendored/.github/workflows/ci.yaml @@ -11,13 +11,5 @@ jobs: shellcheck: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: nextstrain/.github/actions/shellcheck@master - - cram: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - - run: pip install cram - - run: cram tests/ \ No newline at end of file diff --git a/ingest/vendored/.github/workflows/pre-commit.yaml b/ingest/vendored/.github/workflows/pre-commit.yaml new file mode 100644 index 0000000..70da533 --- /dev/null +++ b/ingest/vendored/.github/workflows/pre-commit.yaml @@ -0,0 +1,14 @@ +name: pre-commit + +on: + - push + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - uses: pre-commit/action@v3.0.1 diff --git a/ingest/vendored/.gitrepo b/ingest/vendored/.gitrepo index d0ee1d3..d1afbca 100644 --- a/ingest/vendored/.gitrepo +++ b/ingest/vendored/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/nextstrain/ingest branch = main - commit = 7617c39fae05e5882c5e6c065c5b47d500c998af - parent = 7a69d8383d5729f3d5aa873a194adb223aa8045e + commit = 258ab8ce898a88089bc88caee336f8d683a0e79a + parent = 5690058759d693517833af6d9e5c854190164ee8 method = merge cmdver = 0.4.6 diff --git a/ingest/vendored/.pre-commit-config.yaml b/ingest/vendored/.pre-commit-config.yaml new file mode 100644 index 0000000..2cdf88b --- /dev/null +++ b/ingest/vendored/.pre-commit-config.yaml @@ -0,0 +1,40 @@ +default_language_version: + python: python3 +repos: + - repo: https://github.com/pre-commit/sync-pre-commit-deps + rev: v0.0.1 + hooks: + - id: sync-pre-commit-deps + - repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.10.0.1 + hooks: + - id: shellcheck + - repo: https://github.com/rhysd/actionlint + rev: v1.6.27 + hooks: + - id: actionlint + entry: env SHELLCHECK_OPTS='--exclude=SC2027' actionlint + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: check-ast + - id: check-case-conflict + - id: check-docstring-first + - id: check-json + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-shebang-scripts-are-executable + - id: check-symlinks + - id: check-toml + - id: check-yaml + - id: destroyed-symlinks + - id: detect-private-key + - id: end-of-file-fixer + - id: fix-byte-order-marker + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.4.6 + hooks: + # Run the linter. + - id: ruff diff --git a/ingest/vendored/README.md b/ingest/vendored/README.md index fa91891..a2b54cb 100644 --- a/ingest/vendored/README.md +++ b/ingest/vendored/README.md @@ -2,7 +2,7 @@ Shared internal tooling for pathogen data ingest. Used by our individual pathogen repos which produce Nextstrain builds. Expected to be vendored by -each pathogen repo using `git subtree`. +each pathogen repo using `git subrepo`. Some tools may only live here temporarily before finding a permanent home in `augur curate` or Nextstrain CLI. Others may happily live out their days here. 
@@ -12,6 +12,9 @@ Some tools may only live here temporarily before finding a permanent home in Nextstrain maintained pathogen repos will use [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to vendor ingest scripts. (See discussion on this decision in https://github.com/nextstrain/ingest/issues/3) +For a list of Nextstrain repos that are currently using this method, use [this +GitHub code search](https://github.com/search?type=code&q=org%3Anextstrain+subrepo+%22remote+%3D+https%3A%2F%2Fgithub.com%2Fnextstrain%2Fingest%22). + If you don't already have `git subrepo` installed, follow the [git subrepo installation instructions](https://github.com/ingydotnet/git-subrepo#installation). Then add the latest ingest scripts to the pathogen repo by running: @@ -54,14 +57,14 @@ commit hash if needed. Much of this tooling originated in [ncov-ingest](https://github.com/nextstrain/ncov-ingest) and was passaged thru -[monkeypox's ingest/](https://github.com/nextstrain/monkeypox/tree/@/ingest/). -It subsequently proliferated from [monkeypox][] to other pathogen repos -([rsv][], [zika][], [dengue][], [hepatitisB][], [forecasts-ncov][]) primarily -thru copying. To [counter that +[mpox's ingest/](https://github.com/nextstrain/mpox/tree/@/ingest/). It +subsequently proliferated from [mpox][] to other pathogen repos ([rsv][], +[zika][], [dengue][], [hepatitisB][], [forecasts-ncov][]) primarily thru +copying. To [counter that proliferation](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079), this repo was made. -[monkeypox]: https://github.com/nextstrain/monkeypox +[mpox]: https://github.com/nextstrain/mpox [rsv]: https://github.com/nextstrain/rsv [zika]: https://github.com/nextstrain/zika/pull/24 [dengue]: https://github.com/nextstrain/dengue/pull/10 @@ -114,15 +117,6 @@ Potential Nextstrain CLI scripts - [download-from-s3](download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL. Skips download if the local file already exists and has a hash identical to the S3 object's metadata `sha256sum`. -Potential augur curate scripts - -- [apply-geolocation-rules](apply-geolocation-rules) - Applies user curated geolocation rules to NDJSON records -- [merge-user-metadata](merge-user-metadata) - Merges user annotations with NDJSON records -- [transform-authors](transform-authors) - Abbreviates full author lists to ' et al.' -- [transform-field-names](transform-field-names) - Rename fields of NDJSON records -- [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"[:][, ]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/) -- [transform-strain-names](transform-strain-names) - Ordered search for strain names across several fields. - ## Software requirements Some scripts may require Bash ≥4. If you are running these scripts on macOS, the builtin Bash (`/bin/bash`) does not meet this requirement. You can install [Homebrew's Bash](https://formulae.brew.sh/formula/bash) which is more up to date. @@ -131,7 +125,24 @@ Some scripts may require Bash ≥4. If you are running these scripts on macOS, t Most scripts are untested within this repo, relying on "testing in production". That is the only practical testing option for some scripts such as the ones interacting with S3 and Slack. -For more locally testable scripts, Cram-style functional tests live in `tests` and are run as part of CI. 
To run these locally, +## Working on this repo + +This repo is configured to use [pre-commit](https://pre-commit.com), +to help automatically catch common coding errors and syntax issues +with changes before they are committed to the repo. + +If you will be writing new code or otherwise working within this repo, +please do the following to get started: + +1. [install `pre-commit`](https://pre-commit.com/#install) by running + either `python -m pip install pre-commit` or `brew install + pre-commit`, depending on your preferred package management + solution +2. install the local git hooks by running `pre-commit install` from + the root of the repo +3. when problems are detected, correct them in your local working tree + before committing them. -1. Download Cram: `pip install cram` -2. Run the tests: `cram tests/` +Note that these pre-commit checks are also run in a GitHub Action when +changes are pushed to GitHub, so correcting issues locally will +prevent extra cycles of correction. diff --git a/ingest/vendored/apply-geolocation-rules b/ingest/vendored/apply-geolocation-rules deleted file mode 100755 index 776cf16..0000000 --- a/ingest/vendored/apply-geolocation-rules +++ /dev/null @@ -1,234 +0,0 @@ -#!/usr/bin/env python3 -""" -Applies user curated geolocation rules to the geolocation fields in the NDJSON -records from stdin. The modified records are output to stdout. This does not do -any additional transformations on top of the user curations. -""" -import argparse -import json -from collections import defaultdict -from sys import exit, stderr, stdin, stdout - - -class CyclicGeolocationRulesError(Exception): - pass - - -def load_geolocation_rules(geolocation_rules_file): - """ - Loads the geolocation rules from the provided *geolocation_rules_file*. - Returns the rules as a dict: - { - regions: { - countries: { - divisions: { - locations: corrected_geolocations_tuple - } - } - } - } - """ - geolocation_rules = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) - with open(geolocation_rules_file, 'r') as rules_fh: - for line in rules_fh: - # ignore comments - if line.strip()=="" or line.lstrip()[0] == '#': - continue - - row = line.strip('\n').split('\t') - # Skip lines that cannot be split into raw and annotated geolocations - if len(row) != 2: - print( - f"WARNING: Could not decode geolocation rule {line!r}.", - "Please make sure rules are formatted as", - "'region/country/division/locationregion/country/division/location'.", - file=stderr) - continue - - # remove trailing comments - row[-1] = row[-1].partition('#')[0].rstrip() - raw , annot = tuple( row[0].split('/') ) , tuple( row[1].split('/') ) - - # Skip lines where raw or annotated geolocations cannot be split into 4 fields - if len(raw) != 4: - print( - f"WARNING: Could not decode the raw geolocation {row[0]!r}.", - "Please make sure it is formatted as 'region/country/division/location'.", - file=stderr - ) - continue - - if len(annot) != 4: - print( - f"WARNING: Could not decode the annotated geolocation {row[1]!r}.", - "Please make sure it is formatted as 'region/country/division/location'.", - file=stderr - ) - continue - - - geolocation_rules[raw[0]][raw[1]][raw[2]][raw[3]] = annot - - return geolocation_rules - - -def get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal = None): - """ - Gets the annotated geolocation for the *raw_geolocation* in the provided - *geolocation_rules*. 
- - Recursively traverses the *geolocation_rules* until we get the annotated - geolocation, which must be a Tuple. Returns `None` if there are no - applicable rules for the provided *raw_geolocation*. - - Rules are applied in the order of region, country, division, location. - First checks the provided raw values for geolocation fields, then if there - are not matches, tries to use general rules marked with '*'. - """ - # Always instantiate the rule traversal as an empty list if not provided, - # e.g. the first call of this recursive function - if rule_traversal is None: - rule_traversal = [] - - current_rules = geolocation_rules - # Traverse the geolocation rules based using the rule_traversal values - for field_value in rule_traversal: - current_rules = current_rules.get(field_value) - # If we hit `None`, then we know there are no matching rules, so stop the rule traversal - if current_rules is None: - break - - # We've found the tuple of the annotated geolocation - if isinstance(current_rules, tuple): - return current_rules - - # We've reach the next level of geolocation rules, - # so try to traverse the rules with the next target in raw_geolocation - if isinstance(current_rules, dict): - next_traversal_target = raw_geolocation[len(rule_traversal)] - rule_traversal.append(next_traversal_target) - return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal) - - # We did not find any matching rule for the last traversal target - if current_rules is None: - # If we've used all general rules and we still haven't found a match, - # then there are no applicable rules for this geolocation - if all(value == '*' for value in rule_traversal): - return None - - # If we failed to find matching rule with a general rule as the last - # traversal target, then delete all trailing '*'s to reset rule_traversal - # to end with the last index that is currently NOT a '*' - # [A, *, B, *] => [A, *, B] - # [A, B, *, *] => [A, B] - # [A, *, *, *] => [A] - if rule_traversal[-1] == '*': - # Find the index of the first of the consecutive '*' from the - # end of the rule_traversal - # [A, *, B, *] => first_consecutive_general_rule_index = 3 - # [A, B, *, *] => first_consecutive_general_rule_index = 2 - # [A, *, *, *] => first_consecutive_general_rule_index = 1 - for index, field_value in reversed(list(enumerate(rule_traversal))): - if field_value == '*': - first_consecutive_general_rule_index = index - else: - break - - rule_traversal = rule_traversal[:first_consecutive_general_rule_index] - - # Set the final value to '*' in hopes that by moving to a general rule, - # we can find a matching rule. - rule_traversal[-1] = '*' - - return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal) - - -def transform_geolocations(geolocation_rules, geolocation): - """ - Transform the provided *geolocation* by looking it up in the provided - *geolocation_rules*. - - This will use all rules that apply to the geolocation and rules will - be applied in the order of region, country, division, location. - - Returns the original geolocation if no geolocation rules apply. - - Raises a `CyclicGeolocationRulesError` if more than 1000 rules have - been applied to the raw geolocation. 
- """ - transformed_values = geolocation - rules_applied = 0 - continue_to_apply = True - - while continue_to_apply: - annotated_values = get_annotated_geolocation(geolocation_rules, transformed_values) - - # Stop applying rules if no annotated values were found - if annotated_values is None: - continue_to_apply = False - else: - rules_applied += 1 - - if rules_applied > 1000: - raise CyclicGeolocationRulesError( - "ERROR: More than 1000 geolocation rules applied on the same entry {geolocation!r}." - ) - - # Create a new list of values for comparison to previous values - new_values = list(transformed_values) - for index, value in enumerate(annotated_values): - # Keep original value if annotated value is '*' - if value != '*': - new_values[index] = value - - # Stop applying rules if this rule did not change the values, - # since this means we've reach rules with '*' that no longer change values - if new_values == transformed_values: - continue_to_apply = False - - transformed_values = new_values - - return transformed_values - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--region-field", default="region", - help="Field that contains regions in NDJSON records.") - parser.add_argument("--country-field", default="country", - help="Field that contains countries in NDJSON records.") - parser.add_argument("--division-field", default="division", - help="Field that contains divisions in NDJSON records.") - parser.add_argument("--location-field", default="location", - help="Field that contains location in NDJSON records.") - parser.add_argument("--geolocation-rules", metavar="TSV", required=True, - help="TSV file of geolocation rules with the format: " + - "'' where the raw and annotated geolocations " + - "are formatted as '///'. " + - "If creating a general rule, then the raw field value can be substituted with '*'." + - "Lines starting with '#' will be ignored as comments." + - "Trailing '#' will be ignored as comments.") - - args = parser.parse_args() - - location_fields = [args.region_field, args.country_field, args.division_field, args.location_field] - - geolocation_rules = load_geolocation_rules(args.geolocation_rules) - - for record in stdin: - record = json.loads(record) - - try: - annotated_values = transform_geolocations(geolocation_rules, [record.get(field, '') for field in location_fields]) - except CyclicGeolocationRulesError as e: - print(e, file=stderr) - exit(1) - - for index, field in enumerate(location_fields): - record[field] = annotated_values[index] - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/vendored/merge-user-metadata b/ingest/vendored/merge-user-metadata deleted file mode 100755 index 341c2df..0000000 --- a/ingest/vendored/merge-user-metadata +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -""" -Merges user curated annotations with the NDJSON records from stdin, with the user -curations overwriting the existing fields. The modified records are output -to stdout. This does not do any additional transformations on top of the user -curations. 
-""" -import argparse -import csv -import json -from collections import defaultdict -from sys import exit, stdin, stderr, stdout - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--annotations", metavar="TSV", required=True, - help="Manually curated annotations TSV file. " + - "The TSV should not have a header and should have exactly three columns: " + - "id to match existing metadata, field name, and field value. " + - "If there are multiple annotations for the same id and field, then the last value is used. " + - "Lines starting with '#' are treated as comments. " + - "Any '#' after the field value are treated as comments.") - parser.add_argument("--id-field", default="accession", - help="The ID field in the metadata to use to merge with the annotations.") - - args = parser.parse_args() - - annotations = defaultdict(dict) - with open(args.annotations, 'r') as annotations_fh: - csv_reader = csv.reader(annotations_fh, delimiter='\t') - for row in csv_reader: - if not row or row[0].lstrip()[0] == '#': - continue - elif len(row) != 3: - print("WARNING: Could not decode annotation line " + "\t".join(row), file=stderr) - continue - id, field, value = row - annotations[id][field] = value.partition('#')[0].rstrip() - - for record in stdin: - record = json.loads(record) - - record_id = record.get(args.id_field) - if record_id is None: - print(f"ERROR: ID field {args.id_field!r} does not exist in record", file=stderr) - exit(1) - - record.update(annotations.get(record_id, {})) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/vendored/tests/transform-strain-names/transform-strain-names.t b/ingest/vendored/tests/transform-strain-names/transform-strain-names.t deleted file mode 100644 index 1c05df7..0000000 --- a/ingest/vendored/tests/transform-strain-names/transform-strain-names.t +++ /dev/null @@ -1,17 +0,0 @@ -Look for strain name in "strain" or a list of backup fields. - -If strain entry exists, do not do anything. - - $ echo '{"strain": "i/am/a/strain", "strain_s": "other"}' \ - > | $TESTDIR/../../transform-strain-names \ - > --strain-regex '^.+$' \ - > --backup-fields strain_s accession - {"strain":"i/am/a/strain","strain_s":"other"} - -If strain entry does not exists, search the backup fields - - $ echo '{"strain_s": "other"}' \ - > | $TESTDIR/../../transform-strain-names \ - > --strain-regex '^.+$' \ - > --backup-fields accession strain_s - {"strain_s":"other","strain":"other"} \ No newline at end of file diff --git a/ingest/vendored/transform-authors b/ingest/vendored/transform-authors deleted file mode 100755 index 0bade20..0000000 --- a/ingest/vendored/transform-authors +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 -""" -Abbreviates a full list of authors to be ' et al.' of the NDJSON -record from stdin and outputs modified records to stdout. - -Note: This is a "best effort" approach and can potentially mangle the author name. 
-""" -import argparse -import json -import re -from sys import stderr, stdin, stdout - - -def parse_authors(record: dict, authors_field: str, default_value: str, - index: int, abbr_authors_field: str = None) -> dict: - # Strip and normalize whitespace - new_authors = re.sub(r'\s+', ' ', record[authors_field]) - - if new_authors == "": - new_authors = default_value - else: - # Split authors list on comma/semicolon - # OR "and"/"&" with at least one space before and after - new_authors = re.split(r'(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)', new_authors)[0] - - # if it does not already end with " et al.", add it - if not new_authors.strip('. ').endswith(" et al"): - new_authors += ' et al' - - if abbr_authors_field: - if record.get(abbr_authors_field): - print( - f"WARNING: the {abbr_authors_field!r} field already exists", - f"in record {index} and will be overwritten!", - file=stderr - ) - - record[abbr_authors_field] = new_authors - else: - record[authors_field] = new_authors - - return record - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--authors-field", default="authors", - help="The field containing list of authors.") - parser.add_argument("--default-value", default="?", - help="Default value to use if authors list is empty.") - parser.add_argument("--abbr-authors-field", - help="The field for the generated abbreviated authors. " + - "If not provided, the original authors field will be modified.") - - args = parser.parse_args() - - for index, record in enumerate(stdin): - record = json.loads(record) - - parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/vendored/transform-field-names b/ingest/vendored/transform-field-names deleted file mode 100755 index fde223f..0000000 --- a/ingest/vendored/transform-field-names +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -""" -Renames fields of the NDJSON record from stdin and outputs modified records -to stdout. -""" -import argparse -import json -from sys import stderr, stdin, stdout - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--field-map", nargs="+", - help="Fields names in the NDJSON record mapped to new field names, " + - "formatted as '{old_field_name}={new_field_name}'. " + - "If the old field does not exist in record, the new field will be added with an empty string value." + - "If the new field already exists in record, then the renaming of the old field will be skipped.") - parser.add_argument("--force", action="store_true", - help="Force renaming of old field even if the new field already exists. 
" + - "Please keep in mind this will overwrite the value of the new field.") - - args = parser.parse_args() - - field_map = {} - for field in args.field_map: - old_name, new_name = field.split('=') - field_map[old_name] = new_name - - for record in stdin: - record = json.loads(record) - - for old_field, new_field in field_map.items(): - - if record.get(new_field) and not args.force: - print( - f"WARNING: skipping rename of {old_field} because record", - f"already has a field named {new_field}.", - file=stderr - ) - continue - - record[new_field] = record.pop(old_field, '') - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/vendored/transform-genbank-location b/ingest/vendored/transform-genbank-location deleted file mode 100755 index 70ba56f..0000000 --- a/ingest/vendored/transform-genbank-location +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 -""" -Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate -fields: 'country', 'division', and 'location'. Checks that a record is from -GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". - -Outputs the modified record to stdout. -""" -import json -from sys import stdin, stdout - - -def parse_location(record: dict) -> dict: - # Expected pattern for the location field is "[:][, ]" - # See GenBank docs for their "country" field: - # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ - geographic_data = record['location'].split(':') - - country = geographic_data[0] - division = '' - location = '' - - if len(geographic_data) == 2: - division , _ , location = geographic_data[1].partition(',') - - record['country'] = country.strip() - record['division'] = division.strip() - record['location'] = location.strip() - - return record - - -if __name__ == '__main__': - - for record in stdin: - record = json.loads(record) - - database = record.get('database', '') - if database in {'GenBank', 'RefSeq'}: - parse_location(record) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/vendored/transform-strain-names b/ingest/vendored/transform-strain-names deleted file mode 100755 index d86c0e4..0000000 --- a/ingest/vendored/transform-strain-names +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -""" -Verifies strain name pattern in the 'strain' field of the NDJSON record from -stdin. Adds a 'strain' field to the record if it does not already exist. - -Outputs the modified records to stdout. -""" -import argparse -import json -import re -from sys import stderr, stdin, stdout - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--strain-regex", default="^.+$", - help="Regex pattern for strain names. " + - "Strain names that do not match the pattern will be dropped.") - parser.add_argument("--backup-fields", nargs="*", - help="List of backup fields to use as strain name if the value in 'strain' " + - "does not match the strain regex pattern. 
" + - "If multiple fields are provided, will use the first field that has a non-empty string.") - - args = parser.parse_args() - - strain_name_pattern = re.compile(args.strain_regex) - - for index, record in enumerate(stdin): - record = json.loads(record) - - # Verify strain name matches the strain regex pattern - if strain_name_pattern.match(record.get('strain', '')) is None: - # Default to empty string if not matching pattern - record['strain'] = '' - # Use non-empty value of backup fields if provided - if args.backup_fields: - for field in args.backup_fields: - if record.get(field): - record['strain'] = str(record[field]) - break - - if record['strain'] == '': - print(f"WARNING: Record number {index} has an empty string as the strain name.", file=stderr) - - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() From 644faa9e3e910f4e4a63c7de04952674a081a2d1 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 18 Jul 2024 11:22:31 -0400 Subject: [PATCH 6/7] Add functional institution column --- ingest/config/config.yaml | 1 + ingest/source-data/ncbi-dataset-field-map.tsv | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index bfdb010..bcfe8a5 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -71,4 +71,5 @@ curate: 'sra_accession', 'abbr_authors', 'authors', + 'institution', ] diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv index eb79418..182b977 100644 --- a/ingest/source-data/ncbi-dataset-field-map.tsv +++ b/ingest/source-data/ncbi-dataset-field-map.tsv @@ -14,4 +14,4 @@ BioProjects bioproject_accession BioSample accession biosample_accession SRA Accessions sra_accession Submitter Names authors -Submitter Affiliation submitting_organization +Submitter Affiliation institution From cfa435dc157d1c4f8a40f23f1d1c9b21814c7b78 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 18 Jul 2024 12:36:12 -0400 Subject: [PATCH 7/7] Fixup: Rename `transform` to `curate` --- ingest/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index bcfe8a5..de18a1b 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -7,7 +7,7 @@ ncbi_taxon_id: b: "208895" general: "11250" -# Params for the transform rulegeneral +# Params for the curate rule curate: # Fields to rename. # This is the first step in the pipeline, so any references to field names