From 0b9059e8fcb9c68e4319ced05266f5c20623f790 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 17 Jul 2023 15:17:06 -0700 Subject: [PATCH] [wip] Use "accession" column as ID column directly New options in Augur X.X.X allow usage of this column as the ID column across all subcommands that read metadata. For this workflow in particular, the metadata file can now be used as-is. This removes the need for a modified copy of the metadata. It also allows specifying an original metadata column "strain" as the display strain field, rather than a column "strain_original" generated during the Snakemake workflow. --- Snakefile | 4 ++- config/config_hmpxv1.yaml | 2 +- config/config_hmpxv1_big.yaml | 2 +- config/config_mpxv.yaml | 2 +- .../construct-recency-from-submission-date.py | 3 +- scripts/set_final_strain_name.py | 6 ++-- workflow/snakemake_rules/core.smk | 32 ++++++++++--------- 7 files changed, 29 insertions(+), 22 deletions(-) diff --git a/Snakefile b/Snakefile index 98001979..8ec213aa 100644 --- a/Snakefile +++ b/Snakefile @@ -2,7 +2,9 @@ from packaging import version from augur.__version__ import __version__ as augur_version import sys -min_augur_version = "16.0.0" +# FIXME: Update version once Augur changes¹ are released. +# ¹ https://github.com/nextstrain/augur/commit/9bf1b4ce4169b2a10ea974608188c2fdd918b9bc +min_augur_version = "22.1.0" if version.parse(augur_version) < version.parse(min_augur_version): print("This pipeline needs a newer version of augur than you currently have...") print( diff --git a/config/config_hmpxv1.yaml b/config/config_hmpxv1.yaml index 33199c19..13c3a2d9 100644 --- a/config/config_hmpxv1.yaml +++ b/config/config_hmpxv1.yaml @@ -12,7 +12,7 @@ tree_mask: "config/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/monkeypox/issues/33 strain_id_field: "accession" -display_strain_field: "strain_original" +display_strain_field: "strain" build_name: "hmpxv1" auspice_name: "monkeypox_hmpxv1" diff --git a/config/config_hmpxv1_big.yaml b/config/config_hmpxv1_big.yaml index a2f0f295..d6a663f1 100644 --- a/config/config_hmpxv1_big.yaml +++ b/config/config_hmpxv1_big.yaml @@ -12,7 +12,7 @@ tree_mask: "config/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/monkeypox/issues/33 strain_id_field: "accession" -display_strain_field: "strain_original" +display_strain_field: "strain" build_name: "hmpxv1_big" auspice_name: "monkeypox_hmpxv1_big" diff --git a/config/config_mpxv.yaml b/config/config_mpxv.yaml index ec3b61a5..12a4e820 100644 --- a/config/config_mpxv.yaml +++ b/config/config_mpxv.yaml @@ -12,7 +12,7 @@ tree_mask: "config/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/monkeypox/issues/33 strain_id_field: "accession" -display_strain_field: "strain_original" +display_strain_field: "strain" build_name: "mpxv" auspice_name: "monkeypox_mpxv" diff --git a/scripts/construct-recency-from-submission-date.py b/scripts/construct-recency-from-submission-date.py index 4268dadf..24670b6d 100644 --- a/scripts/construct-recency-from-submission-date.py +++ b/scripts/construct-recency-from-submission-date.py @@ -38,10 +38,11 @@ def get_recency(date_str, ref_date): ) parser.add_argument('--metadata', type=str, required=True, help="metadata file") + parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") parser.add_argument('--output', type=str, required=True, help="output json") args = parser.parse_args() - meta = read_metadata(args.metadata).to_dict(orient="index") + meta = read_metadata(args.metadata, id_columns=args.metadata_id_columns).to_dict(orient="index") node_data = {'nodes':{}} ref_date = datetime.now() diff --git a/scripts/set_final_strain_name.py b/scripts/set_final_strain_name.py index 0036f2a5..08ca9359 100644 --- a/scripts/set_final_strain_name.py +++ b/scripts/set_final_strain_name.py @@ -1,5 +1,6 @@ import pandas as pd import json, argparse +from augur.io import read_metadata def replace_name_recursive(node, lookup): if node["name"] in lookup: @@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup): parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json") parser.add_argument('--metadata', type=str, required=True, help="input data") + parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice") parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") args = parser.parse_args() - metadata = pd.read_csv(args.metadata, sep='\t') + metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns) name_lookup = {} for ri, row in metadata.iterrows(): - strain_id = row['strain'] + strain_id = row.name name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name] with open(args.input_auspice_json, 'r') as fh: diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 8b14e81b..51c68124 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -13,20 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream. """ -rule wrangle_metadata: - input: - metadata="data/metadata.tsv", - output: - metadata="results/metadata.tsv", - params: - strain_id=config["strain_id_field"], - shell: - """ - csvtk -t rename -f strain -n strain_original {input.metadata} \ - | csvtk mutate -t -f {params.strain_id} -n strain > {output.metadata} - """ - - rule exclude_bad: message: """ @@ -37,7 +23,7 @@ rule exclude_bad: """ input: sequences="data/sequences.fasta", - metadata="results/metadata.tsv", + metadata="data/metadata.tsv", exclude=config["exclude"], output: sequences=build_dir + "/{build_name}/good_sequences.fasta", @@ -46,11 +32,13 @@ rule exclude_bad: params: min_date=config["min_date"], min_length=config["min_length"], + strain_id=config["strain_id_field"], shell: """ augur filter \ --sequences {input.sequences} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --exclude {input.exclude} \ --output-sequences {output.sequences} \ --output-metadata {output.metadata} \ @@ -93,11 +81,13 @@ rule filter: group_by=config.get("group_by", "--group-by clade lineage"), sequences_per_group=config["sequences_per_group"], other_filters=config.get("filters", ""), + strain_id=config["strain_id_field"], shell: """ augur filter \ --sequences {input.sequences} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --include {input.include} \ --output-sequences {output.sequences} \ --output-metadata {output.metadata} \ @@ -246,12 +236,14 @@ rule refine: clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}" if "clock_std_dev" in config else "", + strain_id=config["strain_id_field"], shell: """ augur refine \ --tree {input.tree} \ --alignment {input.alignment} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --output-tree {output.tree} \ --timetree \ --root {params.root} \ @@ -320,11 +312,13 @@ rule traits: params: columns="country", sampling_bias_correction=3, + strain_id=config["strain_id_field"], shell: """ augur traits \ --tree {input.tree} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --output {output.node_data} \ --columns {params.columns} \ --confidence \ @@ -400,10 +394,13 @@ rule recency: metadata=build_dir + "/{build_name}/metadata.tsv", output: node_data=build_dir + "/{build_name}/recency.json", + params: + strain_id=config["strain_id_field"], shell: """ python3 scripts/construct-recency-from-submission-date.py \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --output {output} 2>&1 """ @@ -447,11 +444,14 @@ rule export: output: auspice_json=build_dir + "/{build_name}/raw_tree.json", root_sequence=build_dir + "/{build_name}/raw_tree_root-sequence.json", + params: + strain_id=config["strain_id_field"], shell: """ augur export v2 \ --tree {input.tree} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.mutation_context} {input.clades} {input.recency}\ --colors {input.colors} \ --lat-longs {input.lat_longs} \ @@ -471,10 +471,12 @@ rule final_strain_name: auspice_json=build_dir + "/{build_name}/tree.json", root_sequence=build_dir + "/{build_name}/tree_root-sequence.json", params: + strain_id=config["strain_id_field"], display_strain_field=config.get("display_strain_field", "strain"), shell: """ python3 scripts/set_final_strain_name.py --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --input-auspice-json {input.auspice_json} \ --display-strain-name {params.display_strain_field} \ --output {output.auspice_json}