Skip to content

Commit

Permalink
Merge pull request #18 from nextstrain/james/dedup-segments
Browse files Browse the repository at this point in the history
Group segments by strains
  • Loading branch information
jameshadfield authored Oct 11, 2024
2 parents 357dbfe + 091dc7f commit 06b3e6e
Show file tree
Hide file tree
Showing 29 changed files with 5,423 additions and 2,347 deletions.
21 changes: 5 additions & 16 deletions .github/workflows/phylogenetic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,10 @@ on:
If set, builds will be deployed to s3://nextstrain-staging/oropouche_trials_<trial_name>_*
required: false
type: string
sequences_url:
ingest_url_prefix:
description: |
URL for a sequences.fasta.zst file.
If not provided, will use default sequences_url from phylogenetic/defaults/config.yaml
required: false
type: string
metadata_url:
description: |
URL for a metadata.tsv.zst file.
If not provided, will use default metadata_url from phylogenetic/defaults/config.yaml
Location of the ingested data - we will append suffixes such as 'metadata.tsv.zst',
'S/sequences.fasta.zst' etc. to this prefix
required: false
type: string

Expand All @@ -51,8 +45,7 @@ jobs:
name: Set config overrides
env:
TRIAL_NAME: ${{ inputs.trial_name }}
SEQUENCES_URL: ${{ inputs.sequences_url }}
METADATA_URL: ${{ inputs.metadata_url }}
INGEST_URL_PREFIX: ${{ inputs.ingest_url_prefix }}
run: |
config=""
Expand All @@ -61,11 +54,7 @@ jobs:
fi
if [[ "$SEQUENCES_URL" ]]; then
config+=" sequences_url='"$SEQUENCES_URL"'"
fi
if [[ "$METADATA_URL" ]]; then
config+=" metadata_url='"$METADATA_URL"'"
config+=" ingest_url_prefix='"$INGEST_URL_PREFIX"'"
fi
if [[ $config ]]; then
Expand Down
8 changes: 6 additions & 2 deletions ingest/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,12 @@ nextstrain build .

This produces the default outputs of the ingest workflow:

- metadata = results/metadata.tsv
- sequences = results/sequences.fasta
```
results/metadata.tsv
results/S/sequences.fasta
results/M/sequences.fasta
results/L/sequences.fasta
```

### Dumping the full raw metadata from NCBI Datasets

Expand Down
4 changes: 2 additions & 2 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ wildcard_constraints:
rule all:
input:
sequences=expand("results/{segment}/sequences.fasta", segment=segments),
metadata=expand("results/{segment}/metadata.tsv", segment=segments),
metadata_all="results/all/metadata.tsv",
metadata="results/metadata.tsv",


# Note that only PATHOGEN-level customizations should be added to these
Expand All @@ -35,6 +34,7 @@ rule all:
include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"
include: "rules/nextclade.smk"
include: "rules/group_segments.smk"


# We are pushing to standardize ingest workflows with Nextclade runs to include
Expand Down
10 changes: 4 additions & 6 deletions ingest/build-configs/nextstrain-automation/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,12 @@ cloudfront_domain: "data.nextstrain.org"
# Replace <pathogen> with the pathogen repo name.
s3_dst: "s3://nextstrain-data/files/workflows/oropouche"

# Mapping of files to upload
# Mapping of files to upload.
# Keys here are the target S3 key (after combining with "s3_dst")
# Values are the local files produced by the workflow
files_to_upload:
ncbi.ndjson.zst: data/ncbi.ndjson
all/metadata.tsv.zst: results/all/metadata.tsv
all/sequences.fasta.zst: results/all/sequences.fasta
L/metadata.tsv.zst: results/L/metadata.tsv
metadata.tsv.zst: results/metadata.tsv
L/sequences.fasta.zst: results/L/sequences.fasta
M/metadata.tsv.zst: results/M/metadata.tsv
M/sequences.fasta.zst: results/M/sequences.fasta
S/metadata.tsv.zst: results/S/metadata.tsv
S/sequences.fasta.zst: results/S/sequences.fasta
24 changes: 23 additions & 1 deletion ingest/defaults/annotations.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@
# id to match existing metadata, field name, and field value
# If there are multiple annotations for the same id and field, then the last value is used
# Lines starting with '#' are treated as comments
# Any '#' after the field value are treated as comments.
# Any '#' after the field value are treated as comments.

# NOTE: Here we use accession as the ID, however using strain name would be better going forward as it would reduce
# the duplication needed in the current format. We can't (currently) do this in oropouche because strain names are
# added _after_ the curate chain runs.

PP952119 region North America # strain IRCCS-SCDC_1/2024 from traveler, L segment
PP952119 country Cuba # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11212459/
PP952119 date 2024-06-11
Expand All @@ -13,3 +18,20 @@ PP952118 date 2024-06-11
PP952117 region North America # strain IRCCS-SCDC_1/2024 from traveler, S segment
PP952117 country Cuba # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11212459/
PP952117 date 2024-06-11

# Strain 'H498913', 'date' had 2 observed values: HQ830423, HQ830388: 1988-XX-XX; HQ830457: 1990-XX-XX
HQ830457 date 1988-XX-XX

# When grouped by strain these segments have similar (but different) authors - we change them to the most complete author list
PP477303 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477315 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477304 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477316 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477305 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477317 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477306 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477318 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477307 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477319 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477308 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477320 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
16 changes: 15 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,10 @@ curate:
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: "sequence"
# The list of metadata columns to keep in the final output of the curation pipeline.
# (We do not export 'strain' here as it is added separately via an Entrez call)
metadata_columns: [
"accession",
"accession_version",
"strain",
"date",
"region",
"country",
Expand All @@ -123,3 +123,17 @@ curate:
nextclade:
segment_reference: "../shared/oropouche_{segment}.fasta"
min_seed_cover: 0.01


# Settings read by the `group_segments` rule (ingest/rules/group_segments.smk).
grouping:
  # Metadata columns handed to scripts/group_segments.py via
  # --common-strain-fields.
  # NOTE(review): presumably these are the fields expected to agree across all
  # segments of one strain - confirm against the script.
  common_strain_fields:
    - date
    - region
    - country
    - division
    - location
    - host
    - authors
    - abbr_authors
    - institution
  # YAML file handed to scripts/group_segments.py via --resolutions.
  resolutions: defaults/segment_resolutions.yaml
45 changes: 45 additions & 0 deletions ingest/defaults/segment_resolutions.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Resolutions passed to scripts/group_segments.py via --resolutions.
# Each entry names a strain, a segment (S, M or L) and one accession.
# NOTE(review): presumably the accession listed is the one to keep when a
# strain has more than one candidate sequence for that segment - confirm
# against scripts/group_segments.py.
- strain: TRVL9760
  accession: KP026181  # matches the metadata for the other segments for this strain
  segment: S
- strain: BeAn19991
  accession: KP052851
  segment: M
- strain: BeAn19991
  accession: KP052852
  segment: S
- strain: BeAn_626990
  accession: MG747521
  segment: S
- strain: BeH_543629
  accession: MG747572
  segment: S
- strain: BeH_543857
  accession: MG747578
  segment: S
- strain: BeAn_206119
  accession: MG747539
  segment: S
- strain: BeH_543760
  accession: MG747575
  segment: S
- strain: BeAn_208402
  accession: MG747542
  segment: S
- strain: BeAn_208819
  accession: MG747545
  segment: S
- strain: PPS_523_H_669315
  accession: MG747584
  segment: S
- strain: PPS_522_H_669314
  accession: MG747581
  segment: S
- strain: PMOH_682426
  accession: MG747587
  segment: S
- strain: PMOH_682431
  accession: MG747590
  segment: S
- strain: BeH505764  # Note: strain is dropped by phylo exclude rule
  accession: PP357050
  segment: S
25 changes: 6 additions & 19 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ rule curate:
all_geolocation_rules="data/all-geolocation-rules.tsv",
annotations=config["curate"]["annotations"],
output:
metadata="data/all_metadata.tsv",
sequences="results/all/sequences.fasta",
metadata="data/metadata_curated.tsv",
sequences="data/sequences.fasta",
log:
"logs/curate.txt",
benchmark:
Expand Down Expand Up @@ -116,28 +116,15 @@ rule curate:
--output-seq-field {params.sequence_field} ) 2>> {log}
"""


rule replace_strain_names:
input:
metadata="data/all_metadata.tsv",
strains = "data/strain-names.tsv"
output:
metadata="data/all_metadata_with_strains.tsv",
shell:
"""
tsv-select -H --exclude strain {input.metadata} | \
tsv-join -H --filter-file {input.strains} --key-fields accession --append-fields strain > {output.metadata}
"""

rule subset_metadata:
rule subset_curated_metadata_columns:
input:
metadata="data/all_metadata_with_strains.tsv",
metadata="data/metadata_curated.tsv",
output:
metadata="results/all/metadata.tsv",
metadata="data/metadata_subset.tsv",
params:
metadata_fields=",".join(config["curate"]["metadata_columns"]),
shell:
"""
r"""
tsv-select -H -f {params.metadata_fields} \
{input.metadata} > {output.metadata}
"""
2 changes: 1 addition & 1 deletion ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ rule entrez_via_accessions:
accessions
"""
input:
metadata="data/all_metadata.tsv",
metadata="data/metadata_curated.tsv",
output:
genbank="data/genbank.gb",
benchmark:
Expand Down
43 changes: 43 additions & 0 deletions ingest/rules/group_segments.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@

rule group_segments:
    """
    Run scripts/group_segments.py on the merged metadata table and write the
    final results/metadata.tsv.

    The script receives the list of common strain fields and the resolutions
    file from the `grouping` config section, plus the workflow's segment list.
    NOTE(review): presumably this collapses per-accession rows into one row
    per strain - confirm against scripts/group_segments.py.
    """
    input:
        metadata="data/metadata_merged.tsv",
        resolutions=config["grouping"]["resolutions"],
    output:
        metadata="results/metadata.tsv",
    params:
        # Lists are expanded by Snakemake into space-separated arguments.
        common_strain_fields=config["grouping"]["common_strain_fields"],
        segments=segments,
    shell:
        r"""
        python3 scripts/group_segments.py \
            --metadata {input.metadata} \
            --common-strain-fields {params.common_strain_fields} \
            --segments {params.segments} \
            --resolutions {input.resolutions} \
            --output-metadata {output.metadata}
        """

rule subset_sequences_by_segment:
    """
    Write results/{segment}/sequences.fasta containing only the records whose
    accession appears in the `accession_{segment}` column of the grouped
    metadata, renamed to the matching `strain` value.

    Steps:
      1. Build a header-less two-column TSV (accession_{segment}, strain) from
         the metadata, keeping rows where the accession column is non-empty.
      2. Use it as a seqkit key-value file to replace each FASTA name; names
         with no entry in the map are replaced by `drop_key`.
      3. Remove every record whose name is exactly `drop_key`.
    """
    input:
        metadata="results/metadata.tsv",
        sequences="data/sequences.fasta",
    output:
        kv_map=temp("data/kv-map_{segment}.tsv"),
        sequences="results/{segment}/sequences.fasta",
    params:
        columns=lambda w: f"accession_{w.segment},strain",
        filter_exp=lambda w: f"len($accession_{w.segment})>0",
        # Sentinel name given to sequences that are not in the kv map.
        drop_key="__DROP__",
    shell:
        # Read the declared {input.metadata} rather than a hard-coded path so
        # the command always matches the rule's dependency declaration.
        r"""
        csvtk cut -t -f {params.columns} {input.metadata} \
            | csvtk filter2 -t -U -f {params.filter_exp:q} \
            > {output.kv_map} && \
        seqkit replace \
            -p "(.*)" --replacement "{{kv}}" --kv-file {output.kv_map} -m {params.drop_key} \
            {input.sequences} \
            | seqkit grep -v -r -p '^{params.drop_key}$' \
            > {output.sequences}
        """
48 changes: 35 additions & 13 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -20,35 +20,57 @@ https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html

rule run_nextclade_to_identify_segment:
input:
sequences = "results/all/sequences.fasta",
sequences = "data/sequences.fasta",
segment_reference = config["nextclade"]["segment_reference"],
output:
sequences = "results/{segment}/sequences.fasta",
nextclade = temp("data/nextclade_{segment}.tsv"),
params:
min_seed_cover = config["nextclade"]["min_seed_cover"],
shell:
"""
r"""
nextclade run \
--input-ref {input.segment_reference} \
--output-fasta {output.sequences} \
--output-tsv {output.nextclade} \
--min-seed-cover {params.min_seed_cover} \
--silent \
{input.sequences}
"""

rule subset_metadata_by_segment:
rule parse_nextclade_tsv:
input:
metadata = "results/all/metadata.tsv",
sequences = "results/{segment}/sequences.fasta",
nextclade = "data/nextclade_{segment}.tsv",
output:
metadata = "results/{segment}/metadata.tsv",
summary = "data/nextclade_{segment}_summary.tsv",
params:
strain_id_field = config["curate"]["output_id_field"],
nextclade_cols = 'seqName,qc.overallStatus',
new_cols = lambda w: f'accession,qc_{w.segment}',
mutate_exp = lambda w: f'len($qc_{w.segment})>0 ? "1" : "0"',
segment_col = lambda w: f'segment_{w.segment}',
shell:
r"""
csvtk cut -t -H -f {params.nextclade_cols:q} {input.nextclade:q} \
| csvtk rename -t -f {params.nextclade_cols:q} -n {params.new_cols:q} \
| csvtk mutate2 -t -n {params.segment_col:q} --at 2 -e {params.mutate_exp:q} \
> {output.summary:q}
echo "Nextclade aligned $(( $(cat {output.summary} | csvtk grep -t -f {params.segment_col} -p '1' -U | wc -l) ))/$(( $(wc -l < {input.nextclade}) -1 )) sequences to segment {wildcards.segment}"
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id_field} \


rule merge_metadata:
    """
    Join the strain-name table, the column-subset curated metadata and the
    per-segment Nextclade summary tables into data/metadata_merged.tsv using
    `augur merge`, keyed on the `accession` column.
    """
    input:
        strain="data/strain-names.tsv",
        main="data/metadata_subset.tsv",
        segments=expand("data/nextclade_{segment}_summary.tsv", segment=segments),
    output:
        metadata="data/metadata_merged.tsv",
    params:
        # augur merge requires NAME=FILEPATH arguments, so each summary file is
        # labelled s_0, s_1, ... in the order it appears in input.segments.
        segments=lambda w, input: " ".join(
            f"s_{position}={path}" for position, path in enumerate(input.segments)
        ),
    shell:
        r"""
        augur merge \
            --metadata strains={input.strain} main={input.main} {params.segments} \
            --metadata-id-columns accession \
            --output-metadata {output.metadata}
        """
Loading

0 comments on commit 06b3e6e

Please sign in to comment.