Skip to content

Commit

Permalink
[fixup dengue copy] refactor: move post_process_metadata to rule transform
Browse files Browse the repository at this point in the history

To simplify the workflow, instead of post-processing metadata after the
transform step to clean up strain names and set the dengue serotype based on
the virus lineage ID, incorporate the post-processing directly into the
transform step. This step was moved above any manual annotations. This also
simplifies the code by removing the two separate code blocks that determined
the final metadata columns, which could have drifted out of sync.
  • Loading branch information
j23414 committed Jul 3, 2023
1 parent a7e9d67 commit d503fed
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 54 deletions.
50 changes: 11 additions & 39 deletions ingest/bin/post_process_metadata.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,13 @@
#! /usr/bin/env python3

import argparse
import os
import sys

import numpy as np
import pandas as pd

import json
from sys import stdin, stdout

def parse_args():
    """Parse command-line options and return the populated namespace.

    Two required options are accepted:
      --metadata  path to the NCBI Virus metadata.tsv input file
      --outfile   path for the processed output TSV
    """
    arg_parser = argparse.ArgumentParser(
        description="Reformat a NCBI Virus metadata.tsv file for a pathogen build."
    )
    # Both options are mandatory; declare them uniformly from a small table.
    for flag, help_text in (
        ("--metadata", "NCBI Virus metadata.tsv file."),
        ("--outfile", "Output file name, e.g. processed_metadata.tsv."),
    ):
        arg_parser.add_argument(flag, help=help_text, required=True)
    return arg_parser.parse_args()

Expand All @@ -44,7 +32,7 @@ def _set_url(record):

def _set_paper_url(record):
"""Set paper_url from a comma separate list of PubMed IDs in publication. Only use the first ID."""
if pd.isna(record["publications"]):
if (not record["publications"]):
return ""

return (
Expand All @@ -55,30 +43,14 @@ def _set_paper_url(record):

def main():
args = parse_args()
df = pd.read_csv(args.metadata, sep="\t", header=0)

df["strain"] = df.apply(_set_strain_name, axis=1)
df["url"] = df.apply(_set_url, axis=1)
df["paper_url"] = df.apply(_set_paper_url, axis=1)
df["authors"] = df["abbr_authors"]
df["city"] = df["location"]

METADATA_COLUMNS = [
"strain",
"accession",
"genbank_accession_rev",
"date",
"updated",
"region",
"country",
"division",
"city",
"authors",
"url",
"title",
"paper_url",
]
df.to_csv(args.outfile, sep="\t", index=False, columns=METADATA_COLUMNS)

for index, record in enumerate(stdin):
record = json.loads(record)
record["strain"] = _set_strain_name(record)
record["url"] = _set_url(record)
record["paper_url"] = _set_paper_url(record)
record["authors"] = record["abbr_authors"]
stdout.write(json.dumps(record) + "\n")


if __name__ == "__main__":
Expand Down
7 changes: 3 additions & 4 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@ transform:
'accession',
'genbank_accession_rev',
'strain',
'strain_s',
'viruslineage_ids',
'date',
'updated',
'region',
Expand All @@ -64,10 +62,11 @@ transform:
'host',
'date_submitted',
'sra_accession',
'abbr_authors',
'reverse',
'authors',
'institution',
'title',
'publications'
'journal',
'url',
'paper_url'
]
13 changes: 2 additions & 11 deletions ingest/workflow/snakemake_rules/transform.smk
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ rule transform:
all_geolocation_rules="data/all-geolocation-rules.tsv",
annotations=config["transform"]["annotations"],
output:
metadata="data/raw_metadata_{serotype}.tsv",
metadata="data/metadata_{serotype}.tsv",
sequences="data/sequences_{serotype}.fasta",
log:
"logs/transform_{serotype}.txt",
Expand Down Expand Up @@ -85,6 +85,7 @@ rule transform:
--abbr-authors-field {params.abbr_authors_field} \
| ./bin/apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| ./bin/post_process_metadata.py \
| ./bin/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand All @@ -96,16 +97,6 @@ rule transform:
--sequence-field {params.sequence_field} ) 2>> {log}
"""

rule post_process_metadata:
input:
metadata="data/raw_metadata_{serotype}.tsv",
output:
metadata="data/metadata_{serotype}.tsv",
shell:
"""
./bin/post_process_metadata.py --metadata {input.metadata} --outfile {output.metadata}
"""


rule compress:
input:
Expand Down

0 comments on commit d503fed

Please sign in to comment.