From f59c45a373f923b0315232cb505bd1d4a0be5641 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Tue, 21 May 2024 12:41:15 -0700 Subject: [PATCH] Refactor `annotations` out of config, into rule [#8] --- ingest/config/defaults.yaml | 21 +++++++++++++++------ ingest/rules/curate.smk | 2 +- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index f2fc52c..8439b4e 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -11,31 +11,29 @@ viruses: # for each virus, give the NCBI taxon ID (required for data fetch from # NCBI datasets) and the path to the manual annotations file, relative # to the ingest directory -## TODO see if the `annotations` key can be refactored out into the -## Snakemake file with a `{virus}` wildcard 229e: ncbi_taxon_id: "11137" - annotations: "config/229e/annotations.tsv" nl63: ncbi_taxon_id: "277944" - annotations: "config/nl63/annotations.tsv" oc43: ncbi_taxon_id: "31631" - annotations: "config/oc43/annotations.tsv" hku1: ncbi_taxon_id: "290028" - annotations: "config/hku1/annotations.tsv" + # Optional fields to add to the NCBI Datasets output ncbi_dataset_fields: [] + # Config parameters related to the curate pipeline, shared by all viruses curate: # URL pointed to public generalized geolocation rules # For the Nextstrain team, this is currently # 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/@/source-data/gisaid_geoLocationRules.tsv' geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/@/source-data/gisaid_geoLocationRules.tsv" + # The path to the local geolocation rules within the pathogen repo # The path should be relative to the ingest directory. 
local_geolocation_rules: "config/geolocation_rules.tsv" + # List of field names to change in the format of <old_field_name>=<new_field_name> # This is the first step in the pipeline, so any references to field names # in the configs below should use the new field names @@ -51,8 +49,10 @@ curate: Submitter Affiliation: institution SRA Accessions: sra_accession passage_type: passage_type + # List of date fields to standardize to ISO format YYYY-MM-DD date_fields: ["date"] + # List of expected date formats that are present in the date fields provided above # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes @@ -61,6 +61,7 @@ curate: - "%Y-%m" - "%Y-%m-%d" - "%Y-%m-%dT%H:%M:%SZ" + titlecase: # Abbreviations not cast to titlecase, keeps uppercase abbreviations: ["USA"] @@ -85,20 +86,28 @@ curate: - sur - the - y + # List of string fields to titlecase fields: ["region", "country", "division", "location"] + # Metadata field that contains the list of authors associated with the sequence authors_field: "authors" + # Default value to use if the authors field is empty authors_default_value: "?" + # Name to use for the generated abbreviated authors field abbr_authors_field: "abbr_authors" + # The ID field in the metadata to use to merge the manual annotations annotations_id: "strain" + # The ID field in the metadata to use as the sequence id in the output FASTA file output_id_field: "strain" + # The field in the NDJSON record that contains the actual genomic sequence output_sequence_field: "sequence" + # The list of metadata columns to keep in the final output of the curation pipeline. 
metadata_columns: - strain diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index d4e4faf..b9cd63a 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -65,7 +65,7 @@ rule curate: sequences_ndjson="data/{virus}/ncbi.ndjson", # Change the geolocation_rules input path if you are removing the above two rules all_geolocation_rules="data/all-geolocation-rules.tsv", - annotations=lambda wildcards: config[wildcards.virus]["annotations"], + annotations="config/{virus}/annotations.tsv", output: metadata="results/{virus}/all_metadata.tsv", sequences="results/{virus}/sequences.fasta",