diff --git a/vendored/.github/workflows/ci.yaml b/vendored/.github/workflows/ci.yaml index bbf40f72..c6a218a5 100644 --- a/vendored/.github/workflows/ci.yaml +++ b/vendored/.github/workflows/ci.yaml @@ -1,9 +1,11 @@ name: CI on: - - push - - pull_request - - workflow_dispatch + push: + branches: + - main + pull_request: + workflow_dispatch: jobs: shellcheck: @@ -18,4 +20,4 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 - run: pip install cram - - run: cram tests/ + - run: cram tests/ \ No newline at end of file diff --git a/vendored/.gitrepo b/vendored/.gitrepo index fc2150f5..13a71698 100644 --- a/vendored/.gitrepo +++ b/vendored/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/nextstrain/ingest branch = main - commit = c97df238518171c2b1574bec0349a55855d1e7a7 - parent = 6ef4dc097df037130845d002e54eb4b7338e3d5b + commit = 7617c39fae05e5882c5e6c065c5b47d500c998af + parent = 6c0a9cc7a1c3cfc6a055707a0eb661af56befeb6 method = merge cmdver = 0.4.6 diff --git a/vendored/README.md b/vendored/README.md index 533b39ad..fa918913 100644 --- a/vendored/README.md +++ b/vendored/README.md @@ -25,6 +25,31 @@ Any future updates of ingest scripts can be pulled in with: git subrepo pull ingest/vendored ``` +If you run into merge conflicts and would like to pull in a fresh copy of the +latest ingest scripts, pull with the `--force` flag: + +``` +git subrepo pull ingest/vendored --force +``` + +> **Warning** +> Beware of rebasing/dropping the parent commit of a `git subrepo` update + +`git subrepo` relies on metadata in the `ingest/vendored/.gitrepo` file, +which includes the hash for the parent commit in the pathogen repos. +If this hash no longer exists in the commit history, there will be errors when +running future `git subrepo pull` commands. + +If you run into an error similar to the following: +``` +$ git subrepo pull ingest/vendored +git-subrepo: Command failed: 'git branch subrepo/ingest/vendored '. 
+fatal: not a valid object name: '' +``` +Check the parent commit hash in the `ingest/vendored/.gitrepo` file and make +sure the commit exists in the commit history. Update to the appropriate parent +commit hash if needed. + ## History Much of this tooling originated in @@ -72,10 +97,9 @@ Scripts for supporting ingest workflow automation that don’t really belong in NCBI interaction scripts that are useful for fetching public metadata and sequences. - [fetch-from-ncbi-entrez](fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file. - Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/) or [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs. -- [fetch-from-ncbi-virus](fetch-from-ncbi-virus) - Fetch metadata and nucleotide sequences from [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/) and output NDJSON records to stdout. -- [ncbi-virus-url](ncbi-virus-url) - Generates the URL to download metadata and sequences from NCBI Virus as a single CSV file. -- [csv-to-ndjson](csv-to-ndjson) - Converts CSV file to NDJSON file with a hard-coded 200MiB field size limit to accommodate sequences in the NCBI Virus download. + Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs. + +Historically, some pathogen repos used the undocumented NCBI Virus API through [fetch-from-ncbi-virus](https://github.com/nextstrain/ingest/blob/c97df238518171c2b1574bec0349a55855d1e7a7/fetch-from-ncbi-virus) to fetch data. However, we've opted to drop the NCBI Virus scripts due to https://github.com/nextstrain/ingest/issues/18. 
Potential Nextstrain CLI scripts @@ -97,6 +121,7 @@ Potential augur curate scripts - [transform-authors](transform-authors) - Abbreviates full author lists to ' et al.' - [transform-field-names](transform-field-names) - Rename fields of NDJSON records - [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"[:][, ]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/) +- [transform-strain-names](transform-strain-names) - Ordered search for strain names across several fields. ## Software requirements diff --git a/vendored/csv-to-ndjson b/vendored/csv-to-ndjson deleted file mode 100755 index 84befe08..00000000 --- a/vendored/csv-to-ndjson +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert CSV on stdin to NDJSON on stdout. -usage: `cat dummy.csv | ./csv-to-ndjson > dummy.ndjson` -""" -import csv -import json -from sys import stdin, stdout - -# 200 MiB; default is 128 KiB -csv.field_size_limit(200 * 1024 * 1024) - -for row in csv.DictReader(stdin): - json.dump(row, stdout, allow_nan = False, indent = None, separators = ',:') - print() diff --git a/vendored/docs/ncbi-virus-all-fields-example.json b/vendored/docs/ncbi-virus-all-fields-example.json deleted file mode 100644 index bbf9a7f7..00000000 --- a/vendored/docs/ncbi-virus-all-fields-example.json +++ /dev/null @@ -1,292 +0,0 @@ -{ - "ExportDate_dt": "2023-08-08T21:02:01.475Z", - "QualNum_i": 0, - "QualPct_d": 0.0, - "IncompleteCdsCnt_i": 0, - "gi_l": 1798174254, - "Host_s": "Homo sapiens", - "HostSpecies_s": "Homo sapiens (human), taxid:9606|", - "HostLineage_ss": [ - "cellular organisms, taxid:131567| biota", - "Eukaryota (eucaryotes), taxid:2759| eukaryotes Eucarya Eucaryotae Eukarya Eukaryotae", - "Opisthokonta, taxid:33154| Fungi/Metazoa group opisthokonts", - "Metazoa (metazoans), taxid:33208| multicellular animals Animalia animals", - "Eumetazoa, taxid:6072|", - "Bilateria, taxid:33213|", - 
"Deuterostomia (deuterostomes), taxid:33511|", - "Chordata (chordates), taxid:7711|", - "Craniata, taxid:89593|", - "Vertebrata (vertebrates), taxid:7742|", - "Gnathostomata (jawed vertebrates), taxid:7776|", - "Teleostomi, taxid:117570|", - "Euteleostomi (bony vertebrates), taxid:117571|", - "Sarcopterygii, taxid:8287|", - "Dipnotetrapodomorpha, taxid:1338369|", - "Tetrapoda (tetrapods), taxid:32523|", - "Amniota (amniotes), taxid:32524|", - "Mammalia (mammals), taxid:40674|", - "Theria, taxid:32525|", - "Eutheria (placentals), taxid:9347| eutherian mammals placental mammals Placentalia", - "Boreoeutheria, taxid:1437010| Boreotheria", - "Euarchontoglires, taxid:314146|", - "Primates, taxid:9443| Primata primates", - "Haplorrhini, taxid:376913|", - "Simiiformes, taxid:314293| Anthropoidea", - "Catarrhini, taxid:9526|", - "Hominoidea (apes), taxid:314295| ape", - "Hominidae (great apes), taxid:9604| Pongidae", - "Homininae, taxid:207598| Homo/Pan/Gorilla group", - "Homo (humans), taxid:9605|", - "Homo sapiens (human), taxid:9606|" - ], - "HostLineageId_ss": [ - "131567", - "2759", - "33154", - "33208", - "6072", - "33213", - "33511", - "7711", - "89593", - "7742", - "7776", - "117570", - "117571", - "8287", - "1338369", - "32523", - "32524", - "40674", - "32525", - "9347", - "1437010", - "314146", - "9443", - "376913", - "314293", - "9526", - "314295", - "9604", - "207598", - "9605", - "9606" - ], - "Locus_s": "NC_045512", - "OrgId_i": 2697049, - "VirusFamily_s": "Coronaviridae", - "VirusGenus_s": "Betacoronavirus", - "VirusSpecies_s": "Severe acute respiratory syndrome-related coronavirus", - "VirusSpeciesId_i": 694009, - "VirusLineage_ss": [ - "Viruses, taxid:10239| Vira Viridae viruses", - "Riboviria (RNA viruses), taxid:2559587| RNA viruses and viroids", - "Orthornavirae, taxid:2732396|", - "Pisuviricota, taxid:2732408|", - "Pisoniviricetes, taxid:2732506|", - "Nidovirales, taxid:76804|", - "Cornidovirineae, taxid:2499399|", - "Coronaviridae, taxid:11118|", - 
"Orthocoronavirinae, taxid:2501931|", - "Betacoronavirus, taxid:694002| Coronavirus", - "Sarbecovirus, taxid:2509511|", - "Severe acute respiratory syndrome-related coronavirus, taxid:694009| HCoV-SARS SARS SARSr-CoV SARSrCoV", - "Severe acute respiratory syndrome coronavirus 2, taxid:2697049| SARS-CoV-2", - "RNA viruses" - ], - "VirusLineageId_ss": [ - "10239", - "2559587", - "2732396", - "2732408", - "2732506", - "76804", - "2499399", - "11118", - "2501931", - "694002", - "2509511", - "694009", - "2697049" - ], - "VirusL0_s": "RNA viruses", - "VirusL1_s": "Orthornavirae, taxid:2732396", - "VirusL2_s": "Pisuviricota, taxid:2732408", - "VirusL3_s": "Pisoniviricetes, taxid:2732506", - "VirusL4_s": "Nidovirales, taxid:76804", - "VirusL5_s": "Cornidovirineae, taxid:2499399", - "VirusL6_s": "Coronaviridae, taxid:11118", - "VirusL7_s": "Orthocoronavirinae, taxid:2501931", - "VirusL8_s": "Betacoronavirus, taxid:694002", - "VirusL9_s": "Sarbecovirus, taxid:2509511", - "VirusL10_s": "Severe acute respiratory syndrome-related coronavirus, taxid:694009", - "ViralHost_ss": [ - "human", - "vertebrates" - ], - "GenomicMoltype_s": "ssRNA(+)", - "SLen_i": 29903, - "Flags_ss": [ - "refseq", - "complete" - ], - "Flags_csv": "refseq, complete", - "FlagsCount_i": 2, - "SetAcc_s": "GCF_009858895.2", - "Authors_ss": [ - "Wu,F.", - "Zhao,S.", - "Yu,B.", - "Chen,Y.M.", - "Wang,W.", - "Song,Z.G.", - "Hu,Y.", - "Tao,Z.W.", - "Tian,J.H.", - "Pei,Y.Y.", - "Yuan,M.L.", - "Zhang,Y.L.", - "Dai,F.H.", - "Liu,Y.", - "Wang,Q.M.", - "Zheng,J.J.", - "Xu,L.", - "Holmes,E.C.", - "Zhang,Y.Z.", - "Baranov,P.V.", - "Henderson,C.M.", - "Anderson,C.B.", - "Gesteland,R.F.", - "Atkins,J.F.", - "Howard,M.T.", - "Robertson,M.P.", - "Igel,H.", - "Baertsch,R.", - "Haussler,D.", - "Ares,M. 
Jr.", - "Scott,W.G.", - "Williams,G.D.", - "Chang,R.Y.", - "Brian,D.A.", - "Chen,Y.-M.", - "Song,Z.-G.", - "Tao,Z.-W.", - "Tian,J.-H.", - "Pei,Y.-Y.", - "Zhang,Y.-L.", - "Dai,F.-H.", - "Wang,Q.-M.", - "Zheng,J.-J.", - "Zhang,Y.-Z." - ], - "Authors_csv": "Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Song,Z.G., Hu,Y., Tao,Z.W., Tian,J.H., Pei,Y.Y., Yuan,M.L., Zhang,Y.L., Dai,F.H., Liu,Y., Wang,Q.M., Zheng,J.J., Xu,L., Holmes,E.C., Zhang,Y.Z., Baranov,P.V., Henderson,C.M., Anderson,C.B., Gesteland,R.F., Atkins,J.F., Howard,M.T., Robertson,M.P., Igel,H., Baertsch,R., Haussler,D., Ares,M. Jr., Scott,W.G., Williams,G.D., Chang,R.Y., Brian,D.A., Chen,Y.-M., Song,Z.-G., Tao,Z.-W., Tian,J.-H., Pei,Y.-Y., Zhang,Y.-L., Dai,F.-H., Wang,Q.-M., Zheng,J.-J., Zhang,Y.-Z.", - "AuthorsCount_i": 44, - "Country_s": "China", - "Isolate_s": "Wuhan-Hu-1", - "Lineage_s": "B", - "Division_s": "VRL", - "Keywords_ss": [ - "RefSeq" - ], - "KeywordsCount_i": 1, - "TaxName_s": "Severe acute respiratory syndrome coronavirus 2", - "Region_s": "Asia", - "ParentAcc_s": "set:NC_045512", - "SetPosition_i": 0, - "SourceDB_s": "RefSeq", - "Definition_s": "Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome", - "HostId_i": 9606, - "CreateDate_dt": "2020-01-13T00:00:00Z", - "CreateYear_i": 2020, - "Genome_js": "[{\"id\": \"NC_045512.2\", \"segment\": null, \"proteins\": [{\"id\": \"YP_009724389.1\", \"name\": \"ORF1ab polyprotein\", \"location\": \"join(266..13468,13468..21555)\"}, {\"id\": \"YP_009725295.1\", \"name\": \"ORF1a polyprotein\", \"location\": \"266..13483\"}, {\"id\": \"YP_009724390.1\", \"name\": \"surface glycoprotein\", \"location\": \"21563..25384\"}, {\"id\": \"YP_009724391.1\", \"name\": \"ORF3a protein\", \"location\": \"25393..26220\"}, {\"id\": \"YP_009724392.1\", \"name\": \"envelope protein\", \"location\": \"26245..26472\"}, {\"id\": \"YP_009724393.1\", \"name\": \"membrane glycoprotein\", \"location\": \"26523..27191\"}, {\"id\": 
\"YP_009724394.1\", \"name\": \"ORF6 protein\", \"location\": \"27202..27387\"}, {\"id\": \"YP_009724395.1\", \"name\": \"ORF7a protein\", \"location\": \"27394..27759\"}, {\"id\": \"YP_009725318.1\", \"name\": \"ORF7b\", \"location\": \"27756..27887\"}, {\"id\": \"YP_009724396.1\", \"name\": \"ORF8 protein\", \"location\": \"27894..28259\"}, {\"id\": \"YP_009724397.2\", \"name\": \"nucleocapsid phosphoprotein\", \"location\": \"28274..29533\"}, {\"id\": \"YP_009725255.1\", \"name\": \"ORF10 protein\", \"location\": \"29558..29674\"}]}]", - "MolType_s": "RNA", - "ProtAcc_ss": [ - "YP_009724389", - "YP_009725295", - "YP_009724390", - "YP_009724391", - "YP_009724392", - "YP_009724393", - "YP_009724394", - "YP_009724395", - "YP_009725318", - "YP_009724396", - "YP_009724397", - "YP_009725255" - ], - "ProtAccCount_i": 12, - "UpdateDate_dt": "2020-07-18T00:00:00Z", - "UpdateYear_i": 2020, - "PubMed_ss": [ - "32015508", - "15680415", - "15630477", - "10482585" - ], - "PubMed_csv": "32015508, 15680415, 15630477, 10482585", - "PubMedCount_i": 4, - "Completeness_s": "complete", - "CountryFull_s": "China", - "ProtNames_ss": [ - "ORF1ab polyprotein", - "ORF1a polyprotein", - "surface glycoprotein", - "ORF3a protein", - "envelope protein", - "membrane glycoprotein", - "ORF6 protein", - "ORF7a protein", - "ORF7b protein", - "ORF8 protein", - "nucleocapsid phosphoprotein", - "ORF10 protein" - ], - "ProtNamesCount_i": 12, - "IsolateParsed_s": "Wuhan-Hu-1", - "NuclAcc_ss": [ - "NC_045512" - ], - "NuclAccCount_i": 1, - "CollectionDate_dr": "2019-12", - "CollectionYear_i": 2019, - "SubmitterAffil_s": "National Center for Biotechnology Information, NIH", - "BioProject_ss": [ - "PRJNA485481" - ], - "BioProject_csv": "PRJNA485481", - "BioProjectCount_i": 1, - "AccVer_s": "NC_045512.2", - "CollectionDate_s": "2019-12", - "SubmitterCountry_s": "USA", - "CollectionDate_dt": "2019-12-01T00:00:00Z", - "GenomeCompleteness_s": "complete", - "SubmitterAffilFull_s": "National Center for 
Biotechnology Information, NIH", - "BioProject_s": "PRJNA485481", - "AccNV_s": "NC_045512", - "id": "NC_045512", - "SeqType_s": "Nucleotide", - "FastaMD5_s": "4928f859a1822d291e0225206a0068c8", - "live_i": 1, - "ids_ss": [ - "GCF_009858895", - "GCF_009858895.2", - "NC_045512", - "NC_045512.2", - "PRJNA485481", - "YP_009724389", - "YP_009724390", - "YP_009724391", - "YP_009724392", - "YP_009724393", - "YP_009724394", - "YP_009724395", - "YP_009724396", - "YP_009724397", - "YP_009725255", - "YP_009725295", - "YP_009725318", - "set:NC_045512" - ], - "gi_i": 1798174254, - "_version_": 1773711315042304000 -} diff --git a/vendored/fetch-from-ncbi-virus b/vendored/fetch-from-ncbi-virus deleted file mode 100755 index 39733e6f..00000000 --- a/vendored/fetch-from-ncbi-virus +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash -# usage: fetch-from-ncbi-virus [options] -# -# Fetch metadata and nucleotide sequences from [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/) -# and output NDJSON records to stdout. -# -# [options] are passed directly to ncbi-virus-url. See that script for usage details. -# -# Originally copied from "bin/fetch-from-genbank" in nextstrain/ncov-ingest: -# https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/fetch-from-genbank -# -set -euo pipefail - -bin="$(dirname "$0")" - - -main() { - local ncbi_taxon_id="${1:?NCBI taxon id is required.}" - local github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" - - # "${@:3}" represents all other options, if any. 
- ncbi_virus_url="$("$bin"/ncbi-virus-url --ncbi-taxon-id "$ncbi_taxon_id" "${@:3}")" - - fetch "$ncbi_virus_url" "$github_repo" | "$bin"/csv-to-ndjson -} - -fetch() { - curl "$1" \ - --fail --silent --show-error --http1.1 \ - --header "User-Agent: https://github.com/$2 (hello@nextstrain.org)" -} - -main "$@" diff --git a/vendored/ncbi-virus-url b/vendored/ncbi-virus-url deleted file mode 100755 index 0dd116b8..00000000 --- a/vendored/ncbi-virus-url +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 -""" -Generate URL to download all virus sequences and their curated metadata for a -specified NCBI Taxon ID from GenBank via NCBI Virus. - -The URL this program builds is based on the URL for SARS-CoV-2 constructed with - - https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/genbank-url - -and observing the network activity at - - https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide -""" -from urllib.parse import urlencode -from typing import List, Optional -import argparse - -def parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--ncbi-taxon-id", required=True, - help="NCBI Taxon ID. Visit NCBI virus at " + - "https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/find-data/virus " + - "to search for supported taxon IDs.") - parser.add_argument("--filters", required=False, nargs="*", - help="Filter criteria to add as `fq` param values. " + - "Apply filters via the NCBI Virus UI and observe the network " + - "activity to find the desired filter string.") - parser.add_argument("--fields", required=False, nargs="*", - help="Metadata fields to add as `fl` param values. " + - "Expected to be formatted as :. 
" + - "See docs/ncbi-virus-all-fields-example.json for the available NCBI Virus fields.") - return parser.parse_args() - -def build_query_url(ncbi_taxon_id: str, - filters: Optional[List[str]]=None, - fields: Optional[List[str]]=None): - """ - Generate URL to download all viral sequences and their curated metadata - from GenBank via NCBI Virus. - """ - endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/" - params = { - # Search criteria - 'fq': [ - '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein) - f'VirusLineageId_ss:({ncbi_taxon_id})', - *(filters or []), - ], - - # Unclear, but seems necessary. - 'q': '*:*', - - # Response format - 'cmd': 'download', - 'dlfmt': 'csv', - 'fl': ','.join( - [':'.join(names) for names in [ - # Pairs of (output column name, source data field). - ('genbank_accession', 'id'), - ('genbank_accession_rev', 'AccVer_s'), - ('database', 'SourceDB_s'), - ('strain', 'Isolate_s'), - ('region', 'Region_s'), - ('location', 'CountryFull_s'), - ('collected', 'CollectionDate_s'), - ('submitted', 'CreateDate_dt'), - ('updated', 'UpdateDate_dt'), - ('length', 'SLen_i'), - ('host', 'Host_s'), - ('isolation_source', 'Isolation_csv'), - ('bioproject_accession', 'BioProject_s'), - ('biosample_accession', 'BioSample_s'), - ('sra_accession', 'SRALink_csv'), - ('title', 'Definition_s'), - ('authors', 'Authors_csv'), - ('submitting_organization', 'SubmitterAffilFull_s'), - ('publications', 'PubMed_csv'), - ('sequence', 'Nucleotide_seq'), - ]] + (fields or []) - ), - - # Stable sort with GenBank accessions. - # Columns are source data fields, not our output columns. - 'sort': 'id asc', - - # This isn't Entrez, but include the same email parameter it requires just - # to be nice. 
- 'email': 'hello@nextstrain.org', - } - query = urlencode(params, doseq = True, encoding = "utf-8") - - print(f"{endpoint}?{query}") - -def main(): - args = parse_args() - build_query_url( - ncbi_taxon_id=args.ncbi_taxon_id, - filters=args.filters, - fields=args.fields - ) - -if __name__ == '__main__': - main() diff --git a/vendored/tests/fetch-from-ncbi-virus/filter-and-fields.t b/vendored/tests/fetch-from-ncbi-virus/filter-and-fields.t deleted file mode 100644 index 2fd7020f..00000000 --- a/vendored/tests/fetch-from-ncbi-virus/filter-and-fields.t +++ /dev/null @@ -1,18 +0,0 @@ -Get the virus lineage IDs for 4 early Dengue sequences, testing the options --filter and --field. - - $ $TESTDIR/../../fetch-from-ncbi-virus 12637 nextstrain/ingest \ - > --filters 'CreateDate_dt:([1987-11-29T00:00:00Z TO 1987-11-29T00:00:01Z])' \ - > --fields 'viruslineage_ids:VirusLineageId_ss' - {"genbank_accession":"X05375","genbank_accession_rev":"X05375.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for envelope protein E N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"GTAACTTATGGGACGTGTACCACCACAGGAGAACACAGAAGAGAAAAAAGATCAGTGGCACTCGTTCCACATGTGGGAATGGGACTGGAGACACGAACTGAAACATGGATGTCATCAGAAGGGGCCTGGAAACATGCCCAGAGAATTGAAACTTGGATCTTGAGACATCCAGGCTTTACCATAATGGCAGCAATCCTGGCATACACCATAGGAACGACACATTTCCAAAGAGCCCTGATTTTCATCTTACTGACAGCTGTCGCTCCTTCAATGACAATGCGTTGCATAGGAATATCAAATAGAGACTTTGTAGAAGGGGTTTCAGGAGGAAGCTGGGTTGACATAGTCTTAGAACATGGA","viruslineage_ids":"10239,2559587,2732396,2732406,2732462,2732545,11050,11051,12637,11060"} - 
{"genbank_accession":"X05376","genbank_accession_rev":"X05376.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS1 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"ACAACAATGAGGGGAGCGAAGAGAATGGCCATTTTAGGTGACACAGCTTGGGATTTTGGATCCCTGGGAGGAGTGTTTACATCTATAGGAAAGGCTCTCCACCAAGTTTTCGGAGCAATCTATGGGGCTGCCTTCAGTGGGGTCTCATGGACTATGAAAATCCTCATAGGAGTCATTATCACATGGATAGGAATGAATTCACGCAGCACCTCACTTTCTGTGTCACTAGTATTGGTGGGAGTCGTGACGCTGTATTTGGGAGTTATGGTGCAGGCCGATAGTGGTTGCGTTGTGAGCTGGAAAAACAAAGAACTGAAGTGTGGCAGTGGGATTTTCATCACAGACAACGTGCACACATGG","viruslineage_ids":"10239,2559587,2732396,2732406,2732462,2732545,11050,11051,12637,11060"} - {"genbank_accession":"X05377","genbank_accession_rev":"X05377.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS3 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"CTCACTGTGTGCTACGTGCTCACTGGACGATCGGCCGATTTGGAACTGGAGAGAGCCGCCGATGTCAAATGGGAAGATCAGGCAGAGATATCAGGAAGCAGTCCAATCCTGTCAATAACAATATCAGAAGATGGTAGCATGTCGATAAAAAACGAAGAGGAAGAACAAACACTGACCATACTCATTAGAACAGGATTGCTGGTGATCTCAGGACTTTTTCCTGTATCAATACCAATCACGGCAGCAGCATGGTACCTGTGGGAAGTGAAGAAACAACGGGCTGGAGTATTGTGGGATGTCCCTTCACCCCCACCCGTGGGAAAGGCTGAACTGGAAGATGGAGCCTATAGAATCAAGCAA","viruslineage_ids":"10239,2559587,2732396,2732406,2732462,2732545,11050,11051,12637,11060"} - 
{"genbank_accession":"X05378","genbank_accession_rev":"X05378.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS5 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"GATCCAATACCCTATGATCCAAAGTTTGAAAAGCAGTTGGGACAAGTAATGCTCCTAGTCCTCTGCGGGACTCAAGTGTTGATGATGAGGACTACATGGGCTCTGTGTGAGGCTTTAACCTTAGCGACCGGGCCTATCTCCACATTGTGGGAAGGAAATCCAGGGAGGTTTTGGAACACTACCATTGCAGTGTCAATGGCTAACATTTTTAGAGGGAGTTACTTGGCCGGAGCTGGACTTCTCTTTTCCATCATGAAGAACACAACCAACACGAGAAGGGGAACTGGCAACATAGGAGAGACGCTTGGAGAGAAATGGAAAAGCCGATTGAACGCATTGGGGAAAAGTGAATTCCAGATC","viruslineage_ids":"10239,2559587,2732396,2732406,2732462,2732545,11050,11051,12637,11060"} - -Do the same but without --field. 
- - $ $TESTDIR/../../fetch-from-ncbi-virus 12637 nextstrain/ingest \ - > --filters 'CreateDate_dt:([1987-11-29T00:00:00Z TO 1987-11-29T00:00:01Z])' - {"genbank_accession":"X05375","genbank_accession_rev":"X05375.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for envelope protein E N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"GTAACTTATGGGACGTGTACCACCACAGGAGAACACAGAAGAGAAAAAAGATCAGTGGCACTCGTTCCACATGTGGGAATGGGACTGGAGACACGAACTGAAACATGGATGTCATCAGAAGGGGCCTGGAAACATGCCCAGAGAATTGAAACTTGGATCTTGAGACATCCAGGCTTTACCATAATGGCAGCAATCCTGGCATACACCATAGGAACGACACATTTCCAAAGAGCCCTGATTTTCATCTTACTGACAGCTGTCGCTCCTTCAATGACAATGCGTTGCATAGGAATATCAAATAGAGACTTTGTAGAAGGGGTTTCAGGAGGAAGCTGGGTTGACATAGTCTTAGAACATGGA"} - {"genbank_accession":"X05376","genbank_accession_rev":"X05376.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS1 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"ACAACAATGAGGGGAGCGAAGAGAATGGCCATTTTAGGTGACACAGCTTGGGATTTTGGATCCCTGGGAGGAGTGTTTACATCTATAGGAAAGGCTCTCCACCAAGTTTTCGGAGCAATCTATGGGGCTGCCTTCAGTGGGGTCTCATGGACTATGAAAATCCTCATAGGAGTCATTATCACATGGATAGGAATGAATTCACGCAGCACCTCACTTTCTGTGTCACTAGTATTGGTGGGAGTCGTGACGCTGTATTTGGGAGTTATGGTGCAGGCCGATAGTGGTTGCGTTGTGAGCTGGAAAAACAAAGAACTGAAGTGTGGCAGTGGGATTTTCATCACAGACAACGTGCACACATGG"} - 
{"genbank_accession":"X05377","genbank_accession_rev":"X05377.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS3 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"CTCACTGTGTGCTACGTGCTCACTGGACGATCGGCCGATTTGGAACTGGAGAGAGCCGCCGATGTCAAATGGGAAGATCAGGCAGAGATATCAGGAAGCAGTCCAATCCTGTCAATAACAATATCAGAAGATGGTAGCATGTCGATAAAAAACGAAGAGGAAGAACAAACACTGACCATACTCATTAGAACAGGATTGCTGGTGATCTCAGGACTTTTTCCTGTATCAATACCAATCACGGCAGCAGCATGGTACCTGTGGGAAGTGAAGAAACAACGGGCTGGAGTATTGTGGGATGTCCCTTCACCCCCACCCGTGGGAAAGGCTGAACTGGAAGATGGAGCCTATAGAATCAAGCAA"} - {"genbank_accession":"X05378","genbank_accession_rev":"X05378.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS5 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"GATCCAATACCCTATGATCCAAAGTTTGAAAAGCAGTTGGGACAAGTAATGCTCCTAGTCCTCTGCGGGACTCAAGTGTTGATGATGAGGACTACATGGGCTCTGTGTGAGGCTTTAACCTTAGCGACCGGGCCTATCTCCACATTGTGGGAAGGAAATCCAGGGAGGTTTTGGAACACTACCATTGCAGTGTCAATGGCTAACATTTTTAGAGGGAGTTACTTGGCCGGAGCTGGACTTCTCTTTTCCATCATGAAGAACACAACCAACACGAGAAGGGGAACTGGCAACATAGGAGAGACGCTTGGAGAGAAATGGAAAAGCCGATTGAACGCATTGGGGAAAAGTGAATTCCAGATC"} diff --git a/vendored/tests/fetch-from-ncbi-virus/invalid-taxon-id.t b/vendored/tests/fetch-from-ncbi-virus/invalid-taxon-id.t deleted file mode 100644 index 7a0d5223..00000000 --- 
#!/usr/bin/env python3
"""
Verifies strain name pattern in the 'strain' field of the NDJSON record from
stdin. Adds a 'strain' field to the record if it does not already exist.

Outputs the modified records to stdout.
"""
import argparse
import json
import re
from sys import stderr, stdin, stdout


def _resolve_strain_name(record, strain_name_pattern, backup_fields):
    """
    Return the strain name that should be stored in *record*.

    The current 'strain' value is kept when it matches *strain_name_pattern*.
    Otherwise the first field in *backup_fields* that holds a truthy value is
    used (converted with str()), and an empty string is returned when no
    backup field qualifies. *backup_fields* may be None or empty.

    A missing or null 'strain' is treated as an empty string, and any other
    non-string value is converted with str() before matching: the compiled
    pattern's match() raises TypeError when handed a non-string, which would
    otherwise abort the whole run on a single odd record.
    """
    strain = record.get('strain')
    strain = '' if strain is None else str(strain)

    if strain_name_pattern.match(strain) is not None:
        return strain

    for field in backup_fields or ():
        if record.get(field):
            return str(record[field])

    return ''


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--strain-regex", default="^.+$",
        help="Regex pattern for strain names. " +
             "Strain names that do not match the pattern will be dropped.")
    parser.add_argument("--backup-fields", nargs="*",
        help="List of backup fields to use as strain name if the value in 'strain' " +
             "does not match the strain regex pattern. " +
             "If multiple fields are provided, will use the first field that has a non-empty string.")

    args = parser.parse_args()

    strain_name_pattern = re.compile(args.strain_regex)

    for index, record in enumerate(stdin):
        record = json.loads(record)

        # Always assign, so 'strain' is guaranteed to exist for the check
        # below. An existing key keeps its position in the output; a new key
        # is appended at the end.
        record['strain'] = _resolve_strain_name(
            record, strain_name_pattern, args.backup_fields)

        if record['strain'] == '':
            print(f"WARNING: Record number {index} has an empty string as the strain name.", file=stderr)

        json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
        print()