Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update usage of "accession" as the ID column #161

Merged
merged 4 commits into from
Aug 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ from packaging import version
from augur.__version__ import __version__ as augur_version
import sys

min_augur_version = "16.0.0"
min_augur_version = "22.2.0"
if version.parse(augur_version) < version.parse(min_augur_version):
print("This pipeline needs a newer version of augur than you currently have...")
print(
Expand Down Expand Up @@ -37,8 +37,8 @@ rule all:

rule rename:
input:
auspice_json=build_dir + f"/{config.get('build_name')}/tree.json",
root_sequence=build_dir + f"/{config.get('build_name')}/tree_root-sequence.json",
auspice_json=build_dir + f"/{config['build_name']}/tree.json",
root_sequence=build_dir + f"/{config['build_name']}/tree_root-sequence.json",
output:
auspice_json=auspice_dir + f"/{config.get('auspice_name','tree')}.json",
root_sequence_json=auspice_dir
Expand All @@ -63,7 +63,7 @@ include: "workflow/snakemake_rules/chores.smk"
include: "workflow/snakemake_rules/core.smk"


if config.get("deploy_url"):
if config.get("deploy_url", False):

include: "workflow/snakemake_rules/nextstrain_automation.smk"

Expand Down
4 changes: 3 additions & 1 deletion config/config_hmpxv1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ auspice_config: "config/auspice_config_hmpxv1.json"
description: "config/description.md"
tree_mask: "config/tree_mask.tsv"

# Use `accession` as the ID column since `strain` currently contains duplicates¹.
# ¹ https://github.com/nextstrain/monkeypox/issues/33
strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

build_name: "hmpxv1"
auspice_name: "monkeypox_hmpxv1"
Expand Down
4 changes: 3 additions & 1 deletion config/config_hmpxv1_big.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ auspice_config: "config/auspice_config_hmpxv1_big.json"
description: "config/description.md"
tree_mask: "config/tree_mask.tsv"

# Use `accession` as the ID column since `strain` currently contains duplicates¹.
# ¹ https://github.com/nextstrain/monkeypox/issues/33
strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

build_name: "hmpxv1_big"
auspice_name: "monkeypox_hmpxv1_big"
Expand Down
4 changes: 3 additions & 1 deletion config/config_mpxv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ description: "config/description.md"
clades: "config/clades.tsv"
tree_mask: "config/tree_mask.tsv"

# Use `accession` as the ID column since `strain` currently contains duplicates¹.
# ¹ https://github.com/nextstrain/monkeypox/issues/33
strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

build_name: "mpxv"
auspice_name: "monkeypox_mpxv"
Expand Down
6 changes: 3 additions & 3 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _get_all_targets(wildcards):
]
)

if config.get("trigger_rebuild"):
if config.get("trigger_rebuild", False):
all_targets.append("data/trigger/rebuild.done")

return all_targets
Expand All @@ -73,7 +73,7 @@ include: "workflow/snakemake_rules/transform.smk"
include: "workflow/snakemake_rules/nextclade.smk"


if config.get("upload"):
if config.get("upload", False):

include: "workflow/snakemake_rules/upload.smk"

Expand All @@ -83,6 +83,6 @@ if send_slack_notifications:
include: "workflow/snakemake_rules/slack_notifications.smk"


if config.get("trigger_rebuild"):
if config.get("trigger_rebuild", False):

include: "workflow/snakemake_rules/trigger_rebuild.smk"
2 changes: 2 additions & 0 deletions ingest/workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ rule align:
insertions="data/insertions.csv",
translations="data/translations.zip",
params:
# The value is wrapped in a lambda so that Snakemake does not apply its automatic
# wildcard expansion to the string; the `{gene}` placeholder is left as-is.
# https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
translations=lambda w: "data/translations/{gene}.fasta",
threads: 4
shell:
Expand Down
3 changes: 2 additions & 1 deletion scripts/construct-recency-from-submission-date.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,11 @@ def get_recency(date_str, ref_date):
)

parser.add_argument('--metadata', type=str, required=True, help="metadata file")
parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--output', type=str, required=True, help="output json")
args = parser.parse_args()

meta = read_metadata(args.metadata).to_dict(orient="index")
meta = read_metadata(args.metadata, id_columns=args.metadata_id_columns).to_dict(orient="index")

node_data = {'nodes':{}}
ref_date = datetime.now()
Expand Down
6 changes: 4 additions & 2 deletions scripts/set_final_strain_name.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
import json, argparse
from augur.io import read_metadata

def replace_name_recursive(node, lookup):
if node["name"] in lookup:
Expand All @@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup):

parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
parser.add_argument('--metadata', type=str, required=True, help="input data")
parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
args = parser.parse_args()

metadata = pd.read_csv(args.metadata, sep='\t')
metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns)
name_lookup = {}
for ri, row in metadata.iterrows():
strain_id = row['strain']
strain_id = row.name
name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name]

with open(args.input_auspice_json, 'r') as fh:
Expand Down
7 changes: 3 additions & 4 deletions workflow/snakemake_rules/chores.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@ rule update_example_data:
- sets the subsampling size to 50
- includes the root (defined in config but hardcoded here)
- ensures all clades and lineages are accounted for using --group-by
- uses `accession` as the ID column since `strain` currently contains duplicates

TODO: Use `strain` as the ID column after https://github.com/nextstrain/monkeypox/issues/33 is done.
"""
message:
"Update example data"
Expand All @@ -17,11 +14,13 @@ rule update_example_data:
output:
sequences="example_data/sequences.fasta",
metadata="example_data/metadata.tsv",
params:
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--metadata {input.metadata} \
--metadata-id-columns accession \
--metadata-id-columns {params.strain_id} \
--sequences {input.sequences} \
--include-where strain=MK783032 strain=MK783030 \
--group-by clade lineage \
Expand Down
46 changes: 23 additions & 23 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream.
"""


# Rewrite the metadata table so that the configured ID column is exposed under the name `strain`.
#
# Reads data/metadata.tsv and writes results/metadata.tsv.
# `strain_id` is the column named by `strain_id_field` in the config, or "strain" when that key is absent.
# The shell command runs two csvtk steps:
#   1. rename the existing `strain` column to `strain_original`;
#   2. add a new `strain` column built from the `strain_id` column
#      (no pattern is passed to `csvtk mutate`, so presumably the value is copied unchanged - confirm).
# NOTE(review): when `strain_id` falls back to "strain", step 1 has already renamed that column
# before step 2 reads it - verify that this default case actually works.
rule wrangle_metadata:
input:
metadata="data/metadata.tsv",
output:
metadata="results/metadata.tsv",
params:
strain_id=lambda w: config.get("strain_id_field", "strain"),
shell:
"""
csvtk -t rename -f strain -n strain_original {input.metadata} \
| csvtk mutate -t -f {params.strain_id} -n strain > {output.metadata}
"""


rule exclude_bad:
message:
"""
Expand All @@ -37,7 +23,7 @@ rule exclude_bad:
"""
input:
sequences="data/sequences.fasta",
metadata="results/metadata.tsv",
metadata="data/metadata.tsv",
exclude=config["exclude"],
output:
sequences=build_dir + "/{build_name}/good_sequences.fasta",
Expand All @@ -46,11 +32,13 @@ rule exclude_bad:
params:
min_date=config["min_date"],
min_length=config["min_length"],
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
Expand Down Expand Up @@ -93,11 +81,13 @@ rule filter:
group_by=config.get("group_by", "--group-by clade lineage"),
sequences_per_group=config["sequences_per_group"],
other_filters=config.get("filters", ""),
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--include {input.include} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
Expand Down Expand Up @@ -227,7 +217,7 @@ rule refine:
Note: --use-fft was removed (temporarily) due to https://github.com/neherlab/treetime/issues/242
"""
input:
tree=lambda w: rules.fix_tree.output.tree
tree=rules.fix_tree.output.tree
if config["fix_tree"]
else rules.tree.output.tree,
alignment=build_dir + "/{build_name}/masked.fasta",
Expand All @@ -240,18 +230,20 @@ rule refine:
date_inference="marginal",
clock_filter_iqd=0,
root=config["root"],
clock_rate=lambda w: f"--clock-rate {config['clock_rate']}"
clock_rate=f"--clock-rate {config['clock_rate']}"
if "clock_rate" in config
else "",
clock_std_dev=lambda w: f"--clock-std-dev {config['clock_std_dev']}"
clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}"
if "clock_std_dev" in config
else "",
strain_id=config["strain_id_field"],
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output-tree {output.tree} \
--timetree \
--root {params.root} \
Expand Down Expand Up @@ -320,11 +312,13 @@ rule traits:
params:
columns="country",
sampling_bias_correction=3,
strain_id=config["strain_id_field"],
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output {output.node_data} \
--columns {params.columns} \
--confidence \
Expand Down Expand Up @@ -400,10 +394,13 @@ rule recency:
metadata=build_dir + "/{build_name}/metadata.tsv",
output:
node_data=build_dir + "/{build_name}/recency.json",
params:
strain_id=config["strain_id_field"],
shell:
"""
python3 scripts/construct-recency-from-submission-date.py \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output {output} 2>&1
"""

Expand Down Expand Up @@ -431,29 +428,30 @@ rule export:
input:
tree=rules.refine.output.tree,
metadata=build_dir + "/{build_name}/metadata.tsv",
branch_lengths=lambda w: "results/{build_name}/branch_lengths.json"
branch_lengths="results/{build_name}/branch_lengths.json"
if config.get("timetree", False)
else "results/{build_name}/branch_lengths_no_time.json",
traits=rules.traits.output.node_data,
nt_muts=rules.ancestral.output.node_data,
aa_muts=rules.translate.output.node_data,
clades=build_dir + "/{build_name}/clades.json",
mutation_context=rules.mutation_context.output.node_data,
recency=lambda w: rules.recency.output.node_data
if config.get("recency", False)
else [],
recency=rules.recency.output.node_data if config.get("recency", False) else [],
colors=rules.colors.output.colors,
lat_longs=config["lat_longs"],
description=config["description"],
auspice_config=config["auspice_config"],
output:
auspice_json=build_dir + "/{build_name}/raw_tree.json",
root_sequence=build_dir + "/{build_name}/raw_tree_root-sequence.json",
params:
strain_id=config["strain_id_field"],
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.mutation_context} {input.clades} {input.recency}\
--colors {input.colors} \
--lat-longs {input.lat_longs} \
Expand All @@ -473,10 +471,12 @@ rule final_strain_name:
auspice_json=build_dir + "/{build_name}/tree.json",
root_sequence=build_dir + "/{build_name}/tree_root-sequence.json",
params:
display_strain_field=lambda w: config.get("display_strain_field", "strain"),
strain_id=config["strain_id_field"],
display_strain_field=config.get("display_strain_field", "strain"),
shell:
"""
python3 scripts/set_final_strain_name.py --metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--input-auspice-json {input.auspice_json} \
--display-strain-name {params.display_strain_field} \
--output {output.auspice_json}
Expand Down