Skip to content

Commit

Permalink
Merge pull request #161: Update usage of "accession" as the ID column
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin authored Aug 1, 2023
2 parents 1ef49ba + 927ad6c commit d5dc3b2
Show file tree
Hide file tree
Showing 10 changed files with 50 additions and 40 deletions.
8 changes: 4 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ from packaging import version
from augur.__version__ import __version__ as augur_version
import sys

min_augur_version = "16.0.0"
min_augur_version = "22.2.0"
if version.parse(augur_version) < version.parse(min_augur_version):
print("This pipeline needs a newer version of augur than you currently have...")
print(
Expand Down Expand Up @@ -37,8 +37,8 @@ rule all:

rule rename:
input:
auspice_json=build_dir + f"/{config.get('build_name')}/tree.json",
root_sequence=build_dir + f"/{config.get('build_name')}/tree_root-sequence.json",
auspice_json=build_dir + f"/{config['build_name']}/tree.json",
root_sequence=build_dir + f"/{config['build_name']}/tree_root-sequence.json",
output:
auspice_json=auspice_dir + f"/{config.get('auspice_name','tree')}.json",
root_sequence_json=auspice_dir
Expand All @@ -63,7 +63,7 @@ include: "workflow/snakemake_rules/chores.smk"
include: "workflow/snakemake_rules/core.smk"


if config.get("deploy_url"):
if config.get("deploy_url", False):

include: "workflow/snakemake_rules/nextstrain_automation.smk"

Expand Down
4 changes: 3 additions & 1 deletion config/config_hmpxv1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ auspice_config: "config/auspice_config_hmpxv1.json"
description: "config/description.md"
tree_mask: "config/tree_mask.tsv"

# Use `accession` as the ID column since `strain` currently contains duplicates¹.
# ¹ https://github.com/nextstrain/monkeypox/issues/33
strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

build_name: "hmpxv1"
auspice_name: "monkeypox_hmpxv1"
Expand Down
4 changes: 3 additions & 1 deletion config/config_hmpxv1_big.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ auspice_config: "config/auspice_config_hmpxv1_big.json"
description: "config/description.md"
tree_mask: "config/tree_mask.tsv"

# Use `accession` as the ID column since `strain` currently contains duplicates¹.
# ¹ https://github.com/nextstrain/monkeypox/issues/33
strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

build_name: "hmpxv1_big"
auspice_name: "monkeypox_hmpxv1_big"
Expand Down
4 changes: 3 additions & 1 deletion config/config_mpxv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ description: "config/description.md"
clades: "config/clades.tsv"
tree_mask: "config/tree_mask.tsv"

# Use `accession` as the ID column since `strain` currently contains duplicates¹.
# ¹ https://github.com/nextstrain/monkeypox/issues/33
strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

build_name: "mpxv"
auspice_name: "monkeypox_mpxv"
Expand Down
6 changes: 3 additions & 3 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _get_all_targets(wildcards):
]
)

if config.get("trigger_rebuild"):
if config.get("trigger_rebuild", False):
all_targets.append("data/trigger/rebuild.done")

return all_targets
Expand All @@ -73,7 +73,7 @@ include: "workflow/snakemake_rules/transform.smk"
include: "workflow/snakemake_rules/nextclade.smk"


if config.get("upload"):
if config.get("upload", False):

include: "workflow/snakemake_rules/upload.smk"

Expand All @@ -83,6 +83,6 @@ if send_slack_notifications:
include: "workflow/snakemake_rules/slack_notifications.smk"


if config.get("trigger_rebuild"):
if config.get("trigger_rebuild", False):

include: "workflow/snakemake_rules/trigger_rebuild.smk"
2 changes: 2 additions & 0 deletions ingest/workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ rule align:
insertions="data/insertions.csv",
translations="data/translations.zip",
params:
# The lambda is used to deactivate automatic wildcard expansion.
# https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
translations=lambda w: "data/translations/{gene}.fasta",
threads: 4
shell:
Expand Down
3 changes: 2 additions & 1 deletion scripts/construct-recency-from-submission-date.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,11 @@ def get_recency(date_str, ref_date):
)

parser.add_argument('--metadata', type=str, required=True, help="metadata file")
parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--output', type=str, required=True, help="output json")
args = parser.parse_args()

meta = read_metadata(args.metadata).to_dict(orient="index")
meta = read_metadata(args.metadata, id_columns=args.metadata_id_columns).to_dict(orient="index")

node_data = {'nodes':{}}
ref_date = datetime.now()
Expand Down
6 changes: 4 additions & 2 deletions scripts/set_final_strain_name.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
import json, argparse
from augur.io import read_metadata

def replace_name_recursive(node, lookup):
if node["name"] in lookup:
Expand All @@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup):

parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
parser.add_argument('--metadata', type=str, required=True, help="input data")
parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
args = parser.parse_args()

metadata = pd.read_csv(args.metadata, sep='\t')
metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns)
name_lookup = {}
for ri, row in metadata.iterrows():
strain_id = row['strain']
strain_id = row.name
name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name]

with open(args.input_auspice_json, 'r') as fh:
Expand Down
7 changes: 3 additions & 4 deletions workflow/snakemake_rules/chores.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@ rule update_example_data:
- sets the subsampling size to 50
- includes the root (defined in config but hardcoded here)
- ensures all clades and lineages are accounted for using --group-by
- uses `accession` as the ID column since `strain` currently contains duplicates
TODO: Use `strain` as the ID column after https://github.com/nextstrain/monkeypox/issues/33 is done.
"""
message:
"Update example data"
Expand All @@ -17,11 +14,13 @@ rule update_example_data:
output:
sequences="example_data/sequences.fasta",
metadata="example_data/metadata.tsv",
params:
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--metadata {input.metadata} \
--metadata-id-columns accession \
--metadata-id-columns {params.strain_id} \
--sequences {input.sequences} \
--include-where strain=MK783032 strain=MK783030 \
--group-by clade lineage \
Expand Down
46 changes: 23 additions & 23 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream.
"""


rule wrangle_metadata:
input:
metadata="data/metadata.tsv",
output:
metadata="results/metadata.tsv",
params:
strain_id=lambda w: config.get("strain_id_field", "strain"),
shell:
"""
csvtk -t rename -f strain -n strain_original {input.metadata} \
| csvtk mutate -t -f {params.strain_id} -n strain > {output.metadata}
"""


rule exclude_bad:
message:
"""
Expand All @@ -37,7 +23,7 @@ rule exclude_bad:
"""
input:
sequences="data/sequences.fasta",
metadata="results/metadata.tsv",
metadata="data/metadata.tsv",
exclude=config["exclude"],
output:
sequences=build_dir + "/{build_name}/good_sequences.fasta",
Expand All @@ -46,11 +32,13 @@ rule exclude_bad:
params:
min_date=config["min_date"],
min_length=config["min_length"],
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
Expand Down Expand Up @@ -93,11 +81,13 @@ rule filter:
group_by=config.get("group_by", "--group-by clade lineage"),
sequences_per_group=config["sequences_per_group"],
other_filters=config.get("filters", ""),
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--include {input.include} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
Expand Down Expand Up @@ -227,7 +217,7 @@ rule refine:
Note: --use-fft was removed (temporarily) due to https://github.com/neherlab/treetime/issues/242
"""
input:
tree=lambda w: rules.fix_tree.output.tree
tree=rules.fix_tree.output.tree
if config["fix_tree"]
else rules.tree.output.tree,
alignment=build_dir + "/{build_name}/masked.fasta",
Expand All @@ -240,18 +230,20 @@ rule refine:
date_inference="marginal",
clock_filter_iqd=0,
root=config["root"],
clock_rate=lambda w: f"--clock-rate {config['clock_rate']}"
clock_rate=f"--clock-rate {config['clock_rate']}"
if "clock_rate" in config
else "",
clock_std_dev=lambda w: f"--clock-std-dev {config['clock_std_dev']}"
clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}"
if "clock_std_dev" in config
else "",
strain_id=config["strain_id_field"],
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output-tree {output.tree} \
--timetree \
--root {params.root} \
Expand Down Expand Up @@ -320,11 +312,13 @@ rule traits:
params:
columns="country",
sampling_bias_correction=3,
strain_id=config["strain_id_field"],
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output {output.node_data} \
--columns {params.columns} \
--confidence \
Expand Down Expand Up @@ -400,10 +394,13 @@ rule recency:
metadata=build_dir + "/{build_name}/metadata.tsv",
output:
node_data=build_dir + "/{build_name}/recency.json",
params:
strain_id=config["strain_id_field"],
shell:
"""
python3 scripts/construct-recency-from-submission-date.py \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output {output} 2>&1
"""

Expand Down Expand Up @@ -431,29 +428,30 @@ rule export:
input:
tree=rules.refine.output.tree,
metadata=build_dir + "/{build_name}/metadata.tsv",
branch_lengths=lambda w: "results/{build_name}/branch_lengths.json"
branch_lengths="results/{build_name}/branch_lengths.json"
if config.get("timetree", False)
else "results/{build_name}/branch_lengths_no_time.json",
traits=rules.traits.output.node_data,
nt_muts=rules.ancestral.output.node_data,
aa_muts=rules.translate.output.node_data,
clades=build_dir + "/{build_name}/clades.json",
mutation_context=rules.mutation_context.output.node_data,
recency=lambda w: rules.recency.output.node_data
if config.get("recency", False)
else [],
recency=rules.recency.output.node_data if config.get("recency", False) else [],
colors=rules.colors.output.colors,
lat_longs=config["lat_longs"],
description=config["description"],
auspice_config=config["auspice_config"],
output:
auspice_json=build_dir + "/{build_name}/raw_tree.json",
root_sequence=build_dir + "/{build_name}/raw_tree_root-sequence.json",
params:
strain_id=config["strain_id_field"],
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.mutation_context} {input.clades} {input.recency}\
--colors {input.colors} \
--lat-longs {input.lat_longs} \
Expand All @@ -473,10 +471,12 @@ rule final_strain_name:
auspice_json=build_dir + "/{build_name}/tree.json",
root_sequence=build_dir + "/{build_name}/tree_root-sequence.json",
params:
display_strain_field=lambda w: config.get("display_strain_field", "strain"),
strain_id=config["strain_id_field"],
display_strain_field=config.get("display_strain_field", "strain"),
shell:
"""
python3 scripts/set_final_strain_name.py --metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--input-auspice-json {input.auspice_json} \
--display-strain-name {params.display_strain_field} \
--output {output.auspice_json}
Expand Down

0 comments on commit d5dc3b2

Please sign in to comment.