[wip] merge rules

nextstrain · Aug 21, 2023 · 0aa0b23 · 0aa0b23
1 parent 7ec737b
commit 0aa0b23
Show file tree

Hide file tree

Showing 7 changed files with 537 additions and 0 deletions.
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -69,6 +69,7 @@ rule all:
 
 include: "workflow/snakemake_rules/fetch_sequences.smk"
 include: "workflow/snakemake_rules/transform.smk"
+include: "workflow/snakemake_rules/merge.smk"
 
 
 if config.get("upload"):

diff --git a/ingest/bin/ndjson-to-tsv b/ingest/bin/ndjson-to-tsv
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+"""
+Parses NDJSON records from stdin to two different files: a metadata TSV and a
+sequences FASTA.
+
+Records that do not have an ID or sequence will be excluded from the output files.
+"""
+import argparse
+import csv
+import json
+from sys import stderr, stdin
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("--metadata", metavar="TSV", default="data/metadata.tsv",
+        help="The output metadata TSV file")
+    parser.add_argument("--metadata-columns", nargs="+",
+        help="List of fields from the NDJSON records to include as columns in the metadata TSV. " +
+             "Metadata TSV columns will be in the order of the columns provided.")
+    parser.add_argument("--id-field", default='strain',
+        help="Field from the records to use as the sequence ID in the FASTA file.")
+
+    args = parser.parse_args()
+
+    with open(args.metadata, 'wt') as metadata_output:
+        metadata_csv = csv.DictWriter(
+            metadata_output,
+            args.metadata_columns,
+            restval="",
+            extrasaction='ignore',
+            delimiter='\t'
+        )
+        metadata_csv.writeheader()
+
+        for index, record in enumerate(stdin):
+            record = json.loads(record)
+            metadata_csv.writerow(record)
diff --git a/ingest/bin/replace_name.pl b/ingest/bin/replace_name.pl
@@ -0,0 +1,47 @@
+#! /usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $USAGE = "Usage: $0 <lookuptable.tsv> <sequences.fasta>\n";
+$USAGE=$USAGE."  Replaces the names of the sequences in <sequences.fasta> with the names in <lookuptable.tsv>\n";
+
+if (@ARGV != 2) {
+    print $USAGE;
+    exit;
+}
+
+my ($fn1, $fn2) = @ARGV;
+print STDERR "Reading hashmap from $fn1 to rename headers in $fn2\n";
+
+my $fh;
+open ($fh, "<:encoding(UTF-8)", $fn1) or die "Couldn't open $fn1: $!\n";
+
+my %headers;
+my $line;
+
+while(<$fh>) {
+    chomp;
+    $line = $_;
+    my @fields = split(/\t/, $line);
+    $headers{$fields[0]} = $fields[1];
+}
+close($fh);
+
+open ($fh, "<:encoding(UTF-8)", $fn2) or die "Couldn't open $fn2: $!\n";
+
+while(<$fh>) {
+  chomp;
+  if(/>/) {
+    my $header = $_;
+    $header =~ s/>//;
+    if(exists $headers{$header}) {
+      print ">$headers{$header}\n";
+    } else {
+      # print STDERR "Couldn't find $header in lookup table\n";
+      print ">$header\n";
+    }
+  } else {
+    print "$_\n";
+  }
+}
diff --git a/ingest/bin/transform-citations.py b/ingest/bin/transform-citations.py
@@ -0,0 +1,123 @@
+#! /usr/bin/env python
+"""
+Updates the title and journal fields of the NDJSON record from stdin and outputs by making Entrez queries to fetch citations
+"""
+
+import argparse
+import json
+import re
+from sys import stderr, stdin, stdout
+import time
+
+from Bio import SeqIO
+from Bio import Entrez
+import requests
+import warnings
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Use Entrez query to extract citations. Fetch in batches of group size."
+    )
+    parser.add_argument(
+        "--genbank-id-field",
+        default="genbank_accession",
+    )
+    parser.add_argument(
+        "--group-size",
+        default=200,
+        help="Fetch in batches of group size [default:200]",
+        required=False,
+    )
+    parser.add_argument(
+        "--entrez-email",
+        default="hello@nextstrain.org",
+        help="Entrez email address [default:hello@nextstrain.org]",
+        required=False,
+    )
+    return parser.parse_args()
+
+
+def fetch_and_print_citations(data: dict, genbank_id_field: str):
+    genbank_ids = ",".join([d[genbank_id_field] for d in data])
+
+    try:
+        handle = Entrez.efetch(
+            db="nucleotide", id=genbank_ids, rettype="gb", retmode="text", retmax=1000
+        )
+        record = SeqIO.parse(handle, "genbank")
+
+        title_dict = dict()
+        journal_dict = dict()
+        for row in record:
+            title_dict[row.id.split(".")[0]] = row.annotations["references"][0].title
+            journal_dict[row.id.split(".")[0]] = row.annotations["references"][0].journal
+
+        # Maintain the order in data
+        for d in data:
+            if d["genbank_accession"] in title_dict:
+                d["title"] = title_dict[d["genbank_accession"]]
+
+            if d["genbank_accession"] in journal_dict:
+                d["journal"] = journal_dict[d["genbank_accession"]]
+
+            # Always print the record
+            json.dump(d, stdout, allow_nan=False, indent=None, separators=",:")
+            print()
+
+        handle.close()
+    except Exception as exception_msg:
+        for d in data:
+            fetch_one_citation(d, genbank_id_field)
+            time.sleep(1)
+
+    return None
+
+
+def fetch_one_citation(data: dict, genbank_id_field: str):
+    genbank_id = data[genbank_id_field]
+
+    try:
+        handle = Entrez.efetch(
+            db="nucleotide", id=genbank_id, rettype="gb", retmode="text", retmax=1000
+        )
+        record = SeqIO.read(handle, "genbank")
+
+        data["title"] = record.annotations["references"][0].title
+
+        json.dump(data, stdout, allow_nan=False, indent=None, separators=",:")
+        print()
+
+        handle.close()
+
+    except Exception as exception_msg:
+        warnings.warn(f"Pass through and skip title processing for {data[genbank_id_field]}: {exception_msg}", stacklevel=2)
+
+        # example: GenBank ON123563-81 records are in the ndjson but were removed from Entrez
+        json.dump(data, stdout, allow_nan=False, indent=None, separators=",:")
+        print()
+
+    return None
+
+
+def main():
+    args = parse_args()
+
+    data = []
+    chunk_size = int(args.group_size)
+    genbank_id_field = args.genbank_id_field
+
+    Entrez.email = args.entrez_email
+
+    for index, record in enumerate(stdin):
+        data.append(json.loads(record))
+        if (index + 1) % chunk_size == 0:
+            fetch_and_print_citations(data, genbank_id_field)
+            data = []
+
+    if data:
+        fetch_and_print_citations(data, genbank_id_field)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ingest/bin/tsv-to-ndjson b/ingest/bin/tsv-to-ndjson
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+"""
+Copied from "bin/csv-to-ndjson" in nextstrain/ncov-ingest:
+https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/csv-to-ndjson
+
+Convert CSV on stdin to NDJSON on stdout.
+"""
+import csv
+import json
+from sys import stdin, stdout
+
+# 200 MiB; default is 128 KiB
+csv.field_size_limit(200 * 1024 * 1024)
+
+for row in csv.DictReader(stdin, delimiter = '\t'):
+    json.dump(row, stdout, allow_nan = False, indent = None, separators = ',:')
+    print()