Skip to content

Commit

Permalink
Refactor annotations out of config, into rule [#8]
Browse files Browse the repository at this point in the history
  • Loading branch information
genehack committed May 21, 2024
1 parent 7337084 commit f59c45a
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 7 deletions.
21 changes: 15 additions & 6 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,29 @@ viruses:
# For each virus, give the NCBI taxon ID (required for data fetch from
# NCBI datasets). The manual annotations file for each virus lives at
# `config/<virus>/annotations.tsv`, relative to the ingest directory.
## TODO see if the `annotations` key can be refactored out into the
## Snakemake file with a `{virus}` wildcard
229e:
ncbi_taxon_id: "11137"
annotations: "config/229e/annotations.tsv"
nl63:
ncbi_taxon_id: "277944"
annotations: "config/nl63/annotations.tsv"
oc43:
ncbi_taxon_id: "31631"
annotations: "config/oc43/annotations.tsv"
hku1:
ncbi_taxon_id: "290028"
annotations: "config/hku1/annotations.tsv"

# Optional fields to add to the NCBI Datasets output
ncbi_dataset_fields: []

# Config parameters related to the curate pipeline, shared by all viruses
curate:
# URL pointing to the public generalized geolocation rules
# For the Nextstrain team, this is currently
# 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/@/source-data/gisaid_geoLocationRules.tsv'
geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/@/source-data/gisaid_geoLocationRules.tsv"

# The path to the local geolocation rules within the pathogen repo
# The path should be relative to the ingest directory.
local_geolocation_rules: "config/geolocation_rules.tsv"

# Mapping of field names to rename, in the format <old_field_name>: <new_field_name>
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
Expand All @@ -51,8 +49,10 @@ curate:
Submitter Affiliation: institution
SRA Accessions: sra_accession
passage_type: passage_type

# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: ["date"]

# List of expected date formats that are present in the date fields provided above
# These date formats should use the directives expected by Python's datetime module
# See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
Expand All @@ -61,6 +61,7 @@ curate:
- "%Y-%m"
- "%Y-%m-%d"
- "%Y-%m-%dT%H:%M:%SZ"

titlecase:
# Abbreviations that are not cast to titlecase; they are kept in uppercase
abbreviations: ["USA"]
Expand All @@ -85,20 +86,28 @@ curate:
- sur
- the
- y

# List of string fields to titlecase
fields: ["region", "country", "division", "location"]

# Metadata field that contains the list of authors associated with the sequence
authors_field: "authors"

# Default value to use if the authors field is empty
authors_default_value: "?"

# Name to use for the generated abbreviated authors field
abbr_authors_field: "abbr_authors"

# The ID field in the metadata to use to merge the manual annotations
annotations_id: "strain"

# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: "strain"

# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: "sequence"

# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns:
- strain
Expand Down
2 changes: 1 addition & 1 deletion ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ rule curate:
sequences_ndjson="data/{virus}/ncbi.ndjson",
# Change the geolocation_rules input path if you are removing the above two rules
all_geolocation_rules="data/all-geolocation-rules.tsv",
annotations=lambda wildcards: config[wildcards.virus]["annotations"],
annotations="config/{virus}/annotations.tsv",
output:
metadata="results/{virus}/all_metadata.tsv",
sequences="results/{virus}/sequences.fasta",
Expand Down

0 comments on commit f59c45a

Please sign in to comment.