diff --git a/Snakefile b/Snakefile index 98001979..8ec213aa 100644 --- a/Snakefile +++ b/Snakefile @@ -2,7 +2,9 @@ from packaging import version from augur.__version__ import __version__ as augur_version import sys -min_augur_version = "16.0.0" +# FIXME: Update version once Augur changes¹ are released. +# ¹ https://github.com/nextstrain/augur/commit/9bf1b4ce4169b2a10ea974608188c2fdd918b9bc +min_augur_version = "22.1.0" if version.parse(augur_version) < version.parse(min_augur_version): print("This pipeline needs a newer version of augur than you currently have...") print( diff --git a/config/config_hmpxv1.yaml b/config/config_hmpxv1.yaml index 33199c19..13c3a2d9 100644 --- a/config/config_hmpxv1.yaml +++ b/config/config_hmpxv1.yaml @@ -12,7 +12,7 @@ tree_mask: "config/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/monkeypox/issues/33 strain_id_field: "accession" -display_strain_field: "strain_original" +display_strain_field: "strain" build_name: "hmpxv1" auspice_name: "monkeypox_hmpxv1" diff --git a/config/config_hmpxv1_big.yaml b/config/config_hmpxv1_big.yaml index a2f0f295..d6a663f1 100644 --- a/config/config_hmpxv1_big.yaml +++ b/config/config_hmpxv1_big.yaml @@ -12,7 +12,7 @@ tree_mask: "config/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/monkeypox/issues/33 strain_id_field: "accession" -display_strain_field: "strain_original" +display_strain_field: "strain" build_name: "hmpxv1_big" auspice_name: "monkeypox_hmpxv1_big" diff --git a/config/config_mpxv.yaml b/config/config_mpxv.yaml index ec3b61a5..12a4e820 100644 --- a/config/config_mpxv.yaml +++ b/config/config_mpxv.yaml @@ -12,7 +12,7 @@ tree_mask: "config/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/monkeypox/issues/33 strain_id_field: "accession" -display_strain_field: "strain_original" +display_strain_field: "strain" build_name: "mpxv" auspice_name: "monkeypox_mpxv" diff --git a/scripts/construct-recency-from-submission-date.py b/scripts/construct-recency-from-submission-date.py index 4268dadf..24670b6d 100644 --- a/scripts/construct-recency-from-submission-date.py +++ b/scripts/construct-recency-from-submission-date.py @@ -38,10 +38,11 @@ def get_recency(date_str, ref_date): ) parser.add_argument('--metadata', type=str, required=True, help="metadata file") + parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") parser.add_argument('--output', type=str, required=True, help="output json") args = parser.parse_args() - meta = read_metadata(args.metadata).to_dict(orient="index") + meta = read_metadata(args.metadata, id_columns=args.metadata_id_columns).to_dict(orient="index") node_data = {'nodes':{}} ref_date = datetime.now() diff --git a/scripts/set_final_strain_name.py b/scripts/set_final_strain_name.py index 0036f2a5..08ca9359 100644 --- a/scripts/set_final_strain_name.py +++ b/scripts/set_final_strain_name.py @@ -1,5 +1,6 @@ import pandas as pd import json, argparse +from augur.io import read_metadata def replace_name_recursive(node, lookup): if node["name"] in lookup: @@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup): parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json") parser.add_argument('--metadata', type=str, required=True, help="input data") + parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice") parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") args = parser.parse_args() - metadata = pd.read_csv(args.metadata, sep='\t') + metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns) name_lookup = {} for ri, row in metadata.iterrows(): - strain_id = row['strain'] + strain_id = row.name name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name] with open(args.input_auspice_json, 'r') as fh: diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 8b14e81b..51c68124 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -13,20 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream. """ -rule wrangle_metadata: - input: - metadata="data/metadata.tsv", - output: - metadata="results/metadata.tsv", - params: - strain_id=config["strain_id_field"], - shell: - """ - csvtk -t rename -f strain -n strain_original {input.metadata} \ - | csvtk mutate -t -f {params.strain_id} -n strain > {output.metadata} - """ - - rule exclude_bad: message: """ @@ -37,7 +23,7 @@ rule exclude_bad: """ input: sequences="data/sequences.fasta", - metadata="results/metadata.tsv", + metadata="data/metadata.tsv", exclude=config["exclude"], output: sequences=build_dir + "/{build_name}/good_sequences.fasta", @@ -46,11 +32,13 @@ rule exclude_bad: params: min_date=config["min_date"], min_length=config["min_length"], + strain_id=config["strain_id_field"], shell: """ augur filter \ --sequences {input.sequences} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --exclude {input.exclude} \ --output-sequences {output.sequences} \ --output-metadata {output.metadata} \ @@ -93,11 +81,13 @@ rule filter: group_by=config.get("group_by", "--group-by clade lineage"), sequences_per_group=config["sequences_per_group"], other_filters=config.get("filters", ""), + strain_id=config["strain_id_field"], shell: """ augur filter \ --sequences {input.sequences} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --include {input.include} \ --output-sequences {output.sequences} \ --output-metadata {output.metadata} \ @@ -246,12 +236,14 @@ rule refine: clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}" if "clock_std_dev" in config else "", + strain_id=config["strain_id_field"], shell: """ augur refine \ --tree {input.tree} \ --alignment {input.alignment} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --output-tree {output.tree} \ --timetree \ --root {params.root} \ @@ -320,11 +312,13 @@ rule traits: params: columns="country", sampling_bias_correction=3, + strain_id=config["strain_id_field"], shell: """ augur traits \ --tree {input.tree} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --output {output.node_data} \ --columns {params.columns} \ --confidence \ @@ -400,10 +394,13 @@ rule recency: metadata=build_dir + "/{build_name}/metadata.tsv", output: node_data=build_dir + "/{build_name}/recency.json", + params: + strain_id=config["strain_id_field"], shell: """ python3 scripts/construct-recency-from-submission-date.py \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --output {output} 2>&1 """ @@ -447,11 +444,14 @@ rule export: output: auspice_json=build_dir + "/{build_name}/raw_tree.json", root_sequence=build_dir + "/{build_name}/raw_tree_root-sequence.json", + params: + strain_id=config["strain_id_field"], shell: """ augur export v2 \ --tree {input.tree} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.mutation_context} {input.clades} {input.recency}\ --colors {input.colors} \ --lat-longs {input.lat_longs} \ @@ -471,10 +471,12 @@ rule final_strain_name: auspice_json=build_dir + "/{build_name}/tree.json", root_sequence=build_dir + "/{build_name}/tree_root-sequence.json", params: + strain_id=config["strain_id_field"], display_strain_field=config.get("display_strain_field", "strain"), shell: """ python3 scripts/set_final_strain_name.py --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --input-auspice-json {input.auspice_json} \ --display-strain-name {params.display_strain_field} \ --output {output.auspice_json}