Skip to content

Commit

Permalink
[feat] zika_upload script from fauna
Browse files Browse the repository at this point in the history
Rescue some of the original functionality of the zika_upload script from fauna.

https://github.com/nextstrain/fauna/blob/master/vdb/zika_upload.py#L14-L30
  • Loading branch information
j23414 committed Jul 3, 2023
1 parent d503fed commit 8b28622
Show file tree
Hide file tree
Showing 6 changed files with 232 additions and 3 deletions.
70 changes: 67 additions & 3 deletions ingest/bin/post_process_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,56 @@
import json
from sys import stdin, stdout

import re
import csv

def parse_args(argv=None):
    """Parse the command-line options for the metadata post-processing script.

    Parameters
    ----------
    argv : list of str, optional
        Argument strings to parse. When omitted (``None``), argparse falls
        back to ``sys.argv[1:]``, which is the behaviour existing callers
        rely on. Passing an explicit list allows the parser to be used
        programmatically.

    Returns
    -------
    argparse.Namespace
        Namespace with a ``strain_fixes`` attribute holding the path to the
        strain-name fix TSV file.
    """
    parser = argparse.ArgumentParser(
        description="Reformat a NCBI Virus metadata.tsv file for a pathogen build."
    )
    parser.add_argument(
        "--strain-fixes",
        default="source-data/zika_strain_name_fix.tsv",
        help="Path to a TSV file containing strain name fixes."
    )

    return parser.parse_args(argv)


def _set_strain_name(record):
def define_strain_fixes(fname):
    '''
    Load the strain name fix table and return it as a dictionary.

    The file is tab-separated with a header row naming the ``label`` and
    ``fix`` columns. Lines whose first character is ``#`` are skipped.
    Backslash escape sequences written in a label are expanded before the
    label is used as a key.

    From: https://github.com/nextstrain/fauna/blob/bda9e474e3815490904ee230605f49e532d4d77d/vdb/upload.py#L142-L150

    :param fname: path to the TSV file
    :return: dict mapping each (escape-expanded) label to its fix
    '''
    fix_whole_name = {}
    # Use a context manager so the file handle is closed deterministically.
    with open(fname, encoding='utf-8') as handle:
        data_lines = (row for row in handle if not row.startswith('#'))
        for line in csv.DictReader(data_lines, delimiter='\t'):
            # Encoding to latin-1 with backslashreplace keeps non-ASCII
            # characters intact through the unicode-escape round trip; a plain
            # UTF-8 encode would be re-read as latin-1 and corrupt them.
            raw_label = line['label'].encode('latin-1', 'backslashreplace')
            fix_whole_name[raw_label.decode('unicode-escape')] = line['fix']
    return fix_whole_name

def _set_strain_name(record, fixes):
"""Replace spaces, dashes, and periods with underscores in strain name."""
strain_name = record["strain"]

if strain_name in fixes:
strain_name = fixes[strain_name]

strain_name = strain_name.replace('Zika_virus', '').replace('Zikavirus', '').replace('Zika virus', '').replace('Zika', '').replace('ZIKV', '')
strain_name = strain_name.replace('Human', '').replace('human', '').replace('H.sapiens_wt', '').replace('H.sapiens_tc', '').replace('Hsapiens_tc', '').replace('H.sapiens-tc', '').replace('Homo_sapiens', '').replace('Homo sapiens', '').replace('Hsapiens', '').replace('H.sapiens', '')
strain_name = strain_name.replace('/Hu/', '')
strain_name = strain_name.replace('_Asian', '').replace('_Asia', '').replace('_asian', '').replace('_asia', '')
strain_name = strain_name.replace('_URI', '').replace('_SER', '').replace('_PLA', '').replace('_MOS', '').replace('_SAL', '')
strain_name = strain_name.replace('Aaegypti_wt', 'Aedes_aegypti').replace('Aedessp', 'Aedes_sp')
strain_name = strain_name.replace(' ', '').replace('\'', '').replace('(', '').replace(')', '').replace('//', '/').replace('__', '_').replace('.', '').replace(',', '')
strain_name = re.sub('^[\/\_\-]', '', strain_name)

if strain_name in fixes:
strain_name = fixes[strain_name]

try:
strain_name = 'V' + str(int(strain_name))
except:
pass

return (
strain_name.replace(" ", "_")
.replace("-", "_")
Expand All @@ -24,6 +62,28 @@ def _set_strain_name(record):
.replace(")", "_")
)

def define_location_fixes(fname):
    '''
    Load the location fix table and return it as a dictionary.

    The file is tab-separated with a header row naming the ``label`` and
    ``fix`` columns. Lines whose first character is ``#`` are skipped.
    Backslash escape sequences written in a label are expanded before the
    label is used as a key.

    From: https://github.com/nextstrain/fauna/blob/bda9e474e3815490904ee230605f49e532d4d77d/vdb/upload.py#L152-L160

    :param fname: path to the TSV file
    :return: dict mapping each (escape-expanded) label to its fix
    '''
    fix_location = {}
    # Use a context manager so the file handle is closed deterministically.
    with open(fname, encoding='utf-8') as handle:
        data_lines = (row for row in handle if not row.startswith('#'))
        for line in csv.DictReader(data_lines, delimiter='\t'):
            # Encoding to latin-1 with backslashreplace keeps non-ASCII
            # characters intact through the unicode-escape round trip; a plain
            # UTF-8 encode would be re-read as latin-1 and corrupt them.
            raw_label = line['label'].encode('latin-1', 'backslashreplace')
            fix_location[raw_label.decode('unicode-escape')] = line['fix']
    return fix_location

def define_date_fixes(fname):
    '''
    Load the date fix table and return it as a dictionary.

    The file is tab-separated with a header row naming the ``label`` and
    ``fix`` columns. Lines whose first character is ``#`` are skipped.
    Backslash escape sequences written in a label are expanded before the
    label is used as a key.

    From: https://github.com/nextstrain/fauna/blob/bda9e474e3815490904ee230605f49e532d4d77d/vdb/upload.py#L162-L170

    :param fname: path to the TSV file
    :return: dict mapping each (escape-expanded) label to its fix
    '''
    fix_date = {}
    # Use a context manager so the file handle is closed deterministically.
    with open(fname, encoding='utf-8') as handle:
        data_lines = (row for row in handle if not row.startswith('#'))
        for line in csv.DictReader(data_lines, delimiter='\t'):
            # Encoding to latin-1 with backslashreplace keeps non-ASCII
            # characters intact through the unicode-escape round trip; a plain
            # UTF-8 encode would be re-read as latin-1 and corrupt them.
            raw_label = line['label'].encode('latin-1', 'backslashreplace')
            fix_date[raw_label.decode('unicode-escape')] = line['fix']
    return fix_date


def _set_url(record):
"""Set url column from accession"""
Expand All @@ -44,9 +104,13 @@ def _set_paper_url(record):
def main():
args = parse_args()

strain_fixes = define_strain_fixes(args.strain_fixes)
# location_fixes = define_location_fixes('source-data/zika_location_fix.tsv')
# date_fixes = define_date_fixes('source-data/zika_date_fix.tsv')

for index, record in enumerate(stdin):
record = json.loads(record)
record["strain"] = _set_strain_name(record)
record["strain"] = _set_strain_name(record, strain_fixes)
record["url"] = _set_url(record)
record["paper_url"] = _set_paper_url(record)
record["authors"] = record["abbr_authors"]
Expand Down
2 changes: 2 additions & 0 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ transform:
# Local geolocation rules that are only applicable to dengue data
# Local rules can overwrite the general geolocation rules provided above
local_geolocation_rules: 'source-data/geolocation-rules.tsv'
# User strain fixes file
strain_fixes: 'source-data/zika_strain_name_fix.tsv'
# User annotations file
annotations: 'source-data/annotations.tsv'
# ID field used to merge annotations
Expand Down
2 changes: 2 additions & 0 deletions ingest/source-data/zika_date_fix.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
label fix
HN16 2016-05-15
52 changes: 52 additions & 0 deletions ingest/source-data/zika_location_fix.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
label fix
PHE_Guadeloupe guadeloupe
Haiti/2016/PD haiti
Dominican_Republic/2016/PD1 dominican_republic
Haiti/1/2016 haiti
SZ_WIV01 american_samoa
CN/SZ02/2016 american_samoa
SZ01/2016/China american_samoa
FB_GWUH_2016 guatemala
Martinique/2016/FL001Sa martinique
PuertoRico/2016/FL016U puerto_rico
PuertoRico/2016/FL008U puerto_rico
GZ01 venezuela
GZ02/2016 venezuela
Z16019 venezuela
NIID123/2016 vietnam
USA/UT_1/2016 mexico
Brazil/2016/INMI1 brazil
ZJ03 american_samoa
Z16006 american_samoa
Zhejiang04 american_samoa
VE_Ganxian venezuela
GD01 venezuela
GDZ16001 venezuela
Chiba/S36/2016 fiji
SMGC_1 american_samoa
ZJ02 american_samoa
ZKC2/2016 american_samoa
UNK/2016/MA_WGS16_029 el_salvador
THA/PLCal_ZV/2013 thailand
Dominican_Rep_Rus_7EGR_2016 dominican_republic
Dominican_Rep_Rus_8ZBR_2016 dominican_republic
Dominican_Rep_Rus_5RMN_2016 dominican_republic
Dominican_Rep_Rus_4MRG_2016 dominican_republic
Dominican_Rep_Rus_3ALT_2016 dominican_republic
Mexico_Rus_12TVR_2017 mexico
Mexico_Rus_10GNN_2016 mexico
Saint_Barthelemi_Rus_6BRN_2016 saint_barthelemy
Cuba/2017/Hu0046Sa cuba
VIE/Bra/2016 brazil
AFMC_S philippines
AFMC_U philippines
Dominican_Republic/2016/ZB dominican_republic
Henan/001/2016 guatemala
Thailand/1610acTw thailand
Thai/KngSG/17_D501 thailand
SG_EHI_/33164Y17 cuba
USVI/25/2016 usvi
USVI/35/2016 usvi
USVI/4/2016 usvi
USVI/43/2016 usvi
USVI/5/2016 usvi
107 changes: 107 additions & 0 deletions ingest/source-data/zika_strain_name_fix.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
label fix
PRI/PRVABC59/2015 PRVABC59
PRVABC_59 PRVABC59
Ae-aegypti-FL01M Aedes_aegypti/USA/2016/FL01M
Ae-aegypti-FL02M Aedes_aegypti/USA/2016/FL02M
Ae-aegypti-FL03M Aedes_aegypti/USA/2016/FL03M
Ae-aegypti-FL04M Aedes_aegypti/USA/2016/FL04M
Ae-aegypti-FL05M Aedes_aegypti/USA/2016/FL05M
Ae-aegypti-FL06M Aedes_aegypti/USA/2016/FL06M
Ae-aegypti_FL08M Aedes_aegypti/USA/2016/FL08M
FL001Sa Martinique/2016/FL001Sa
FL008U PuertoRico/2016/FL008U
FL010U USA/2016/FL010U
FL016U PuertoRico/2016/FL016U
FL021U USA/2016/FL021U
FL022U USA/2016/FL022U
FL030U USA/2016/FL030U
FL032U USA/2016/FL032U
FL036SE USA/2016/FL036Se
FL038U USA/2016/FL038U
FL039U USA/2016/FL039U
Hu0015Sa USA/2016/Hu0015SA
ZC188Se Colombia/2016/ZC188Se
ZC192Se Colombia/2016/ZC192Se
ZC204Se Colombia/2016/ZC204Se
ZC207Se Colombia/2016/ZC207Se
Aaegypti_wt/USA/2016/FL_01_MOS Aedes_aegypti/USA/2016/FL01M
Aaegypti_wt/USA/2016/FL_02_MOS Aedes_aegypti/USA/2016/FL02M
Aaegypti_wt/USA/2016/FL_03_MOS Aedes_aegypti/USA/2016/FL03M
Aaegypti_wt/USA/2016/FL_04_MOS Aedes_aegypti/USA/2016/FL04M
USA/2016/FL_010_URI USA/2016/FL010U
USA/2016/FL_021_URI USA/2016/FL021U
USA/2016/FL_022_URI USA/2016/FL022U
USA/2016/FL_030_URI USA/2016/FL030U
USA/2016/FL_032_URI USA/2016/FL032U
USA/2016/FL_038_URI USA/2016/FL038U
USA/2016/FL_039_URI USA/2016/FL039U
USA/2016/FL001Sa Martinique/2016/FL001Sa
USA/2016/FL008U PuertoRico/2016/FL008U
USA/2016/FL016U PuertoRico/2016/FL016U
FLUR001 USA/2016/FLUR002
FLUR002 USA/2016/FLUR002
FLUR005 USA/2016/FLUR005
FLUR006 USA/2016/FLUR006
FLUR007 USA/2016/FLUR007
FLUR008 USA/2016/FLUR008
FLUR009 USA/2016/FLUR009
FLUR011 USA/2016/FLUR011
FLUR013 USA/2016/FLUR013
FLUR014 USA/2016/FLUR014
FLUR015 USA/2016/FLUR015
FLUR022 USA/2016/FLUR022
FLUR026 USA/2016/FLUR026
FLSR036 USA/2016/FLSR036
FLWB042 USA/2016/FLWB042
FLSR043 USA/2016/FLSR043
FLWB044 USA/2016/FLWB044
FLUR057 USA/2016/FLUR057
FLUR058 USA/2016/FLUR058
FLUR063 USA/2016/FLUR063
MEX_CIENI551P4 MEX_CIENI551
FLR COL/FLR/2015
NIC/4886_12A1_SP/2016 NIC/4886_12A1/2016
NIC/5005_13A1_SP/2016 NIC/5005_13A1/2016
NIC/6188_13A1_SP/2016 NIC/6188_13A1/2016
NIC/6406_13A1_SP/2016 NIC/6406_13A1/2016
NIC/7252_12A1_SP/2016 NIC/7252_12A1/2016
PHE_semen_Guadeloupe PHE_Guadeloupe
USA/2016/FL_010 USA/2016/FL010
USA/2016/FL010U USA/2016/FL010
USA/2016/FL021U USA/2016/FL021
USA/2016/FL_021 USA/2016/FL021
USA/2016/FL022U USA/2016/FL022
USA/2016/FL_022 USA/2016/FL022
USA/2016/FL030U USA/2016/FL030
USA/2016/FL_030 USA/2016/FL030
USA/2016/FL032U USA/2016/FL032
USA/2016/FL_032 USA/2016/FL032
USA/2016/FL_035 USA/2016/FL035
USA/2016/FL_036 USA/2016/FL036
USA/2016/FL_038 USA/2016/FL038
USA/2016/FL_039 USA/2016/FL039
USA/2016/FL036Se USA/2016/FL036
USA/2016/FL038U USA/2016/FL038
USA/2016/FL039U USA/2016/FL039
Aedes_aegypti/USA/2016/FL01M Aedes_aegypti/USA/2016/FL01
Aaegypti_wt/USA/2016/FL_01 Aedes_aegypti/USA/2016/FL01
Aedes_aegypti/USA/2016/FL02M Aedes_aegypti/USA/2016/FL02
Aaegypti_wt/USA/2016/FL_02 Aedes_aegypti/USA/2016/FL02
Aedes_aegypti/USA/2016/FL03M Aedes_aegypti/USA/2016/FL03
Aaegypti_wt/USA/2016/FL_03 Aedes_aegypti/USA/2016/FL03
Aedes_aegypti/USA/2016/FL04M Aedes_aegypti/USA/2016/FL04
Aaegypti_wt/USA/2016/FL_04 Aedes_aegypti/USA/2016/FL04
Aedes_aegypti/USA/2016/FL05M Aedes_aegypti/USA/2016/FL05
Aaegypti_wt/USA/2016/FL_05 Aedes_aegypti/USA/2016/FL05
Aedes_aegypti/USA/2016/FL06M Aedes_aegypti/USA/2016/FL06
Aaegypti_wt/USA/2016/FL_06 Aedes_aegypti/USA/2016/FL06
Aedes_aegypti/USA/2016/FL08M Aedes_aegypti/USA/2016/FL08
Aaegypti_wt/USA/2016/FL_08 Aedes_aegypti/USA/2016/FL08
MEX/InDRE/Lm/2016 MEX/InDRE/2016
MEX/InDRE/Sm/2016 MEX/InDRE/2016
Martinique/2016/FL001Sa Martinique/2016/FL001
MTQ/2016/FL_001 Martinique/2016/FL001
PLCal_ZV THA/PLCal_ZV/2013
USA/2016/FL_019 USA/2016/FL019
USA/2016/FL_028 USA/2016/FL028
USA/2016/FL_029 USA/2016/FL029
2 changes: 2 additions & 0 deletions ingest/workflow/snakemake_rules/transform.smk
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ rule transform:
input:
sequences_ndjson="data/sequences_{serotype}.ndjson",
all_geolocation_rules="data/all-geolocation-rules.tsv",
strain_fixes=config["transform"]["strain_fixes"],
annotations=config["transform"]["annotations"],
output:
metadata="data/metadata_{serotype}.tsv",
Expand Down Expand Up @@ -86,6 +87,7 @@ rule transform:
| ./bin/apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| ./bin/post_process_metadata.py \
--strain-fixes {input.strain_fixes} \
| ./bin/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down

0 comments on commit 8b28622

Please sign in to comment.