Skip to content

Commit

Permalink
[fauna] Rescue fauna data processing steps
Browse files Browse the repository at this point in the history
Rescue some of the original functionality of the zika_upload script from fauna.
https://github.com/nextstrain/fauna/blob/master/vdb/zika_upload.py#L14-L30

* Remove monkeypox annotations.tsv
* Move strain name, location, and date fixes to annotations.tsv
* Match strain names in fauna database
* Match locations in fauna database if both region and country names do not match
* Match dates in fauna database unless GenBank has been updated
  • Loading branch information
j23414 committed Aug 21, 2023
1 parent 2d755bb commit 729d676
Show file tree
Hide file tree
Showing 2 changed files with 253 additions and 274 deletions.
16 changes: 16 additions & 0 deletions ingest/bin/post_process_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import pandas as pd


import re

def parse_args():
parser = argparse.ArgumentParser(
description="Reformat a NCBI Virus metadata.tsv file for a pathogen build."
Expand All @@ -27,6 +29,20 @@ def parse_args():
def _set_strain_name(record):
"""Replace spaces, dashes, and periods with underscores in strain name."""
strain_name = record["strain"]

strain_name = strain_name.replace('Zika_virus', '').replace('Zikavirus', '').replace('Zika virus', '').replace('Zika', '').replace('ZIKV', '')
strain_name = strain_name.replace('Human', '').replace('human', '').replace('H.sapiens_wt', '').replace('H.sapiens-wt', '').replace('H.sapiens_tc', '').replace('Hsapiens_tc', '').replace('H.sapiens-tc', '').replace('Homo_sapiens', '').replace('Homo sapiens', '').replace('Hsapiens', '').replace('H.sapiens', '')
strain_name = strain_name.replace('/Hu/', '')
strain_name = strain_name.replace('_Asian', '').replace('_Asia', '').replace('_asian', '').replace('_asia', '')
strain_name = strain_name.replace('_URI', '').replace('-URI', '').replace('_SER', '').replace('-SER', '').replace('_PLA', '').replace('-PLA', '').replace('_MOS', '').replace('_SAL', '')
strain_name = strain_name.replace('Aaegypti_wt', 'Aedes_aegypti').replace('Aedessp', 'Aedes_sp')
strain_name = strain_name.replace(' ', '').replace('\'', '').replace('(', '').replace(')', '').replace('//', '/').replace('__', '_').replace('.', '').replace(',', '')
strain_name = re.sub('^[\/\_\-]', '', strain_name)

try:
strain_name = 'V' + str(int(strain_name))
except ValueError:
pass

return (
strain_name.replace(" ", "_")
Expand Down
Loading

0 comments on commit 729d676

Please sign in to comment.