From 6c206bcfea5ae2b008d5160080e3187e63f31fd4 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Mon, 5 Feb 2024 23:38:17 -0800
Subject: [PATCH] Split by serotype using NCBI virus_tax_id

1. Pull virus_tax_id and virus_name from the dengue NCBI dataset
2. Use the virus_tax_id to infer the ncbi_serotype of each record
3. Use the ncbi_serotype field to split sequences and metadata files into serotypes
4. Update target files in the Snakefile to reflect the new serotype split

Co-authored-by: Jover Lee <joverlee521@gmail.com>
---
 ingest/Snakefile                              |  5 +-
 ingest/bin/infer-dengue-serotype.py           | 52 +++++++++++++++++++
 ingest/config/config.yaml                     |  5 ++
 .../snakemake_rules/split_serotypes.smk       | 37 +++++++++++++
 ingest/workflow/snakemake_rules/transform.smk |  9 ++--
 5 files changed, 103 insertions(+), 5 deletions(-)
 create mode 100755 ingest/bin/infer-dengue-serotype.py
 create mode 100644 ingest/workflow/snakemake_rules/split_serotypes.smk

diff --git a/ingest/Snakefile b/ingest/Snakefile
index bfc99b0d..f8dc8833 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -11,10 +11,12 @@ if not config:
 
 send_slack_notifications = config.get("send_slack_notifications", False)
 
+serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4']
 
 def _get_all_targets(wildcards):
     # Default targets are the metadata TSV and sequences FASTA files
-    all_targets = ["results/sequences.fasta", "results/metadata.tsv"]
+    all_targets = expand(["results/sequences_{serotype}.fasta", "results/metadata_{serotype}.tsv"], serotype=serotypes)
+
 
     # Add additional targets based on upload config
     upload_config = config.get("upload", {})
@@ -57,6 +59,7 @@ rule all:
 
 include: "workflow/snakemake_rules/fetch_sequences.smk"
 include: "workflow/snakemake_rules/transform.smk"
+include: "workflow/snakemake_rules/split_serotypes.smk"
 
 
 if config.get("upload", False):
diff --git a/ingest/bin/infer-dengue-serotype.py b/ingest/bin/infer-dengue-serotype.py
new file mode 100755
index 00000000..fe32996d
--- /dev/null
+++ b/ingest/bin/infer-dengue-serotype.py
@@ -0,0 +1,52 @@
+#! /usr/bin/env python3
+
+import argparse
+import json
+from sys import stdin, stdout
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Dengue specific processing of metadata, infer serotype from virus_tax_id",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--virus-tax-id",
+        type=str,
+        default="virus_tax_id",
+        help="Column name containing the NCBI taxon id of the virus serotype.",
+    )
+    parser.add_argument(
+        "--out-col",
+        type=str,
+        default="ncbi_serotype",
+        help="Column name to store the inferred serotype.",
+    )
+    return parser.parse_args()
+
+
+def _get_dengue_serotype(record, col="virus_tax_id"):
+    """Set dengue serotype from virus_tax_id"""
+    dengue_types = {
+        "11053": "denv1",
+        "11060": "denv2",
+        "11069": "denv3",
+        "11070": "denv4",
+        "31634": "denv2", # Dengue virus 2 Thailand/16681/84
+    }
+
+    taxon_id = record[col]
+
+    return dengue_types.get(taxon_id, "")
+
+
+def main():
+    args = parse_args()
+
+    for record in stdin:
+        record = json.loads(record)
+        record[args.out_col] = _get_dengue_serotype(record, col=args.virus_tax_id)
+        stdout.write(json.dumps(record) + "\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
index 231a5bb2..3f08a9ed 100644
--- a/ingest/config/config.yaml
+++ b/ingest/config/config.yaml
@@ -15,6 +15,8 @@ ncbi_datasets_fields:
   - isolate-collection-date
   - release-date
   - update-date
+  - virus-tax-id
+  - virus-name
   - length
   - host-name
   - isolate-lineage-source
@@ -37,6 +39,8 @@ transform:
     'isolate-collection-date=date',
     'release-date=release_date',
     'update-date=update_date',
+    'virus-tax-id=virus_tax_id',
+    'virus-name=virus_name',
     'sra-accs=sra_accessions',    
     'submitter-names=authors',
     'submitter-affiliation=institution',
@@ -96,6 +100,7 @@ transform:
     'host',
     'release_date',
     'update_date',
+    'ncbi_serotype', # inferred from virus_tax_id
     'sra_accessions',
     'abbr_authors',
     'authors',
diff --git a/ingest/workflow/snakemake_rules/split_serotypes.smk b/ingest/workflow/snakemake_rules/split_serotypes.smk
new file mode 100644
index 00000000..e3c92ef7
--- /dev/null
+++ b/ingest/workflow/snakemake_rules/split_serotypes.smk
@@ -0,0 +1,37 @@
+"""
+This part of the workflow handles splitting the data by serotype either based on the 
+NCBI metadata or Nextclade dataset. Could use both if necessary to cross-validate.
+
+    metadata = "results/metadata_all.tsv"
+    sequences = "results/sequences_all.fasta"
+
+This will produce output files as
+
+    metadata_{serotype} = "results/metadata_{serotype}.tsv"
+    sequences_{serotype} = "results/sequences_{serotype}.fasta"
+
+Parameters are expected to be defined in `config.transform`.
+"""
+
+rule split_by_ncbi_serotype:
+    """
+    Split the data by serotype based on the NCBI metadata.
+    """
+    input:
+        metadata = "results/metadata_all.tsv",
+        sequences = "results/sequences_all.fasta"
+    output:
+        metadata = "results/metadata_{serotype}.tsv",
+        sequences = "results/sequences_{serotype}.fasta"
+    params:
+        id_field = config["transform"]["id_field"]
+    shell:
+        """
+        augur filter \
+          --sequences {input.sequences} \
+          --metadata {input.metadata} \
+          --metadata-id-columns {params.id_field} \
+          --query "ncbi_serotype=='{wildcards.serotype}'" \
+          --output-sequences {output.sequences} \
+          --output-metadata {output.metadata}
+        """
diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk
index ec63d00f..2af0b883 100644
--- a/ingest/workflow/snakemake_rules/transform.smk
+++ b/ingest/workflow/snakemake_rules/transform.smk
@@ -6,8 +6,8 @@ formats and expects input file
 
 This will produce output files as
 
-    metadata = "results/metadata.tsv"
-    sequences = "results/sequences.fasta"
+    metadata = "results/metadata_all.tsv"
+    sequences = "results/sequences_all.fasta"
 
 Parameters are expected to be defined in `config.transform`.
 """
@@ -42,8 +42,8 @@ rule transform:
         all_geolocation_rules="data/all-geolocation-rules.tsv",
         annotations=config["transform"]["annotations"],
     output:
-        metadata="results/metadata.tsv",
-        sequences="results/sequences.fasta",
+        metadata="results/metadata_all.tsv",
+        sequences="results/sequences_all.fasta",
     log:
         "logs/transform.txt",
     params:
@@ -85,6 +85,7 @@ rule transform:
                 --abbr-authors-field {params.abbr_authors_field} \
             | ./vendored/apply-geolocation-rules \
                 --geolocation-rules {input.all_geolocation_rules} \
+            | ./bin/infer-dengue-serotype.py \
             | ./vendored/merge-user-metadata \
                 --annotations {input.annotations} \
                 --id-field {params.annotations_id} \