Skip to content

Commit

Permalink
[wip] Use "accession" column as ID column directly
Browse files Browse the repository at this point in the history
New options in Augur X.X.X allow usage of this column as the ID column
across all subcommands that read metadata.

For this workflow in particular, the metadata file can now be used
as-is. This removes the need for a modified copy of the metadata. It
also allows specifying an original metadata column "strain" as the
display strain field, rather than a column "strain_original" generated
during the Snakemake workflow.
  • Loading branch information
victorlin committed Jul 27, 2023
1 parent d49ac87 commit 0b9059e
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 22 deletions.
4 changes: 3 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ from packaging import version
from augur.__version__ import __version__ as augur_version
import sys

min_augur_version = "16.0.0"
# FIXME: Update version once Augur changes¹ are released.
# ¹ https://github.com/nextstrain/augur/commit/9bf1b4ce4169b2a10ea974608188c2fdd918b9bc
min_augur_version = "22.1.0"
if version.parse(augur_version) < version.parse(min_augur_version):
print("This pipeline needs a newer version of augur than you currently have...")
print(
Expand Down
2 changes: 1 addition & 1 deletion config/config_hmpxv1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ tree_mask: "config/tree_mask.tsv"
# Use `accession` as the ID column since `strain` currently contains duplicates¹.
# ¹ https://github.com/nextstrain/monkeypox/issues/33
strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

build_name: "hmpxv1"
auspice_name: "monkeypox_hmpxv1"
Expand Down
2 changes: 1 addition & 1 deletion config/config_hmpxv1_big.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ tree_mask: "config/tree_mask.tsv"
# Use `accession` as the ID column since `strain` currently contains duplicates¹.
# ¹ https://github.com/nextstrain/monkeypox/issues/33
strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

build_name: "hmpxv1_big"
auspice_name: "monkeypox_hmpxv1_big"
Expand Down
2 changes: 1 addition & 1 deletion config/config_mpxv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ tree_mask: "config/tree_mask.tsv"
# Use `accession` as the ID column since `strain` currently contains duplicates¹.
# ¹ https://github.com/nextstrain/monkeypox/issues/33
strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

build_name: "mpxv"
auspice_name: "monkeypox_mpxv"
Expand Down
3 changes: 2 additions & 1 deletion scripts/construct-recency-from-submission-date.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,11 @@ def get_recency(date_str, ref_date):
)

parser.add_argument('--metadata', type=str, required=True, help="metadata file")
parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--output', type=str, required=True, help="output json")
args = parser.parse_args()

meta = read_metadata(args.metadata).to_dict(orient="index")
meta = read_metadata(args.metadata, id_columns=args.metadata_id_columns).to_dict(orient="index")

node_data = {'nodes':{}}
ref_date = datetime.now()
Expand Down
6 changes: 4 additions & 2 deletions scripts/set_final_strain_name.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
import json, argparse
from augur.io import read_metadata

def replace_name_recursive(node, lookup):
if node["name"] in lookup:
Expand All @@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup):

parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
parser.add_argument('--metadata', type=str, required=True, help="input data")
parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
args = parser.parse_args()

metadata = pd.read_csv(args.metadata, sep='\t')
metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns)
name_lookup = {}
for ri, row in metadata.iterrows():
strain_id = row['strain']
strain_id = row.name
name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name]

with open(args.input_auspice_json, 'r') as fh:
Expand Down
32 changes: 17 additions & 15 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream.
"""


rule wrangle_metadata:
input:
metadata="data/metadata.tsv",
output:
metadata="results/metadata.tsv",
params:
strain_id=config["strain_id_field"],
shell:
"""
csvtk -t rename -f strain -n strain_original {input.metadata} \
| csvtk mutate -t -f {params.strain_id} -n strain > {output.metadata}
"""


rule exclude_bad:
message:
"""
Expand All @@ -37,7 +23,7 @@ rule exclude_bad:
"""
input:
sequences="data/sequences.fasta",
metadata="results/metadata.tsv",
metadata="data/metadata.tsv",
exclude=config["exclude"],
output:
sequences=build_dir + "/{build_name}/good_sequences.fasta",
Expand All @@ -46,11 +32,13 @@ rule exclude_bad:
params:
min_date=config["min_date"],
min_length=config["min_length"],
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
Expand Down Expand Up @@ -93,11 +81,13 @@ rule filter:
group_by=config.get("group_by", "--group-by clade lineage"),
sequences_per_group=config["sequences_per_group"],
other_filters=config.get("filters", ""),
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--include {input.include} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
Expand Down Expand Up @@ -246,12 +236,14 @@ rule refine:
clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}"
if "clock_std_dev" in config
else "",
strain_id=config["strain_id_field"],
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output-tree {output.tree} \
--timetree \
--root {params.root} \
Expand Down Expand Up @@ -320,11 +312,13 @@ rule traits:
params:
columns="country",
sampling_bias_correction=3,
strain_id=config["strain_id_field"],
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output {output.node_data} \
--columns {params.columns} \
--confidence \
Expand Down Expand Up @@ -400,10 +394,13 @@ rule recency:
metadata=build_dir + "/{build_name}/metadata.tsv",
output:
node_data=build_dir + "/{build_name}/recency.json",
params:
strain_id=config["strain_id_field"],
shell:
"""
python3 scripts/construct-recency-from-submission-date.py \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output {output} 2>&1
"""

Expand Down Expand Up @@ -447,11 +444,14 @@ rule export:
output:
auspice_json=build_dir + "/{build_name}/raw_tree.json",
root_sequence=build_dir + "/{build_name}/raw_tree_root-sequence.json",
params:
strain_id=config["strain_id_field"],
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.mutation_context} {input.clades} {input.recency}\
--colors {input.colors} \
--lat-longs {input.lat_longs} \
Expand All @@ -471,10 +471,12 @@ rule final_strain_name:
auspice_json=build_dir + "/{build_name}/tree.json",
root_sequence=build_dir + "/{build_name}/tree_root-sequence.json",
params:
strain_id=config["strain_id_field"],
display_strain_field=config.get("display_strain_field", "strain"),
shell:
"""
python3 scripts/set_final_strain_name.py --metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--input-auspice-json {input.auspice_json} \
--display-strain-name {params.display_strain_field} \
--output {output.auspice_json}
Expand Down

0 comments on commit 0b9059e

Please sign in to comment.