Skip to content

Commit

Permalink
Add gene coverage columns during ingest workflow #36
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored Apr 24, 2024
2 parents afa97a4 + db300a5 commit 4ee7ec5
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 10 deletions.
54 changes: 54 additions & 0 deletions ingest/bin/calculate-gene-converage-from-nextclade-translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#! /usr/bin/env python3

import argparse
from sys import stderr

from Bio import SeqIO
import re


def parse_args():
parser = argparse.ArgumentParser(
description="Calculate gene coverage from amino acid sequence. The output will be in TSV format to stdout.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--fasta",
type=str,
help="FASTA file of CDS translations from Nextclade.",
)
parser.add_argument(
"--out-col",
type=str,
default="gene_coverage",
help="Output column name.",
)
return parser.parse_args()


def calculate_gene_coverage_from_nextclade_cds(fasta, out_col):
"""
Calculate gene coverage from amino acid sequence in gene translation FASTA file from Nextclade.
"""
print(f"genbank_accession\t{out_col}")
# Iterate over the sequences in the FASTA file
for record in SeqIO.parse(fasta, "fasta"):
sequence_id = record.id
sequence = str(record.seq)

# Calculate gene coverage
results = re.findall(r"([ACDEFGHIKLMNPQRSTVWY])", sequence.upper())
gene_coverage = round(len(results) / len(sequence), 3)

# Print the results
print(f"{sequence_id}\t{gene_coverage}")


def main():
args = parse_args()

calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_col)


if __name__ == "__main__":
main()
10 changes: 9 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,12 @@ curate:
nextclade:
min_length: 1000 # E gene length is approximately 1400nt
min_seed_cover: 0.1
nextclade_field: 'nextclade_subtype'
gene: ['E','NS1']
# Nextclade Fields to rename to metadata field names.
field_map:
seqName: genbank_accession # ID field used to merge annotations
clade: nextclade_subtype
alignmentStart: alignmentStart
alignmentEnd: alignmentEnd
coverage: genome_coverage
failedCdses: failedCdses
84 changes: 75 additions & 9 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,23 @@ SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES)
rule nextclade_denvX:
"""
For each type, classify into the appropriate subtype
1. Capture the alignment
2. Capture the translations of gene(s) of interest
Note: If using --cds-selection, only the thoese genes are reported in the failedCdses column
"""
input:
sequences="results/sequences_{serotype}.fasta",
dataset="../nextclade_data/{serotype}",
output:
nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv",
nextclade_alignment="results/aligned_{serotype}.fasta",
nextclade_translations=expand("data/translations/{{serotype}}/{gene}/seqs.gene.fasta", gene=config["nextclade"]["gene"]),
threads: 4
params:
min_length=config["nextclade"]["min_length"],
min_seed_cover=config["nextclade"]["min_seed_cover"],
output_translations = lambda wildcards: f"data/translations/{wildcards.serotype}/{{cds}}/seqs.gene.fasta",
wildcard_constraints:
serotype=SEROTYPE_CONSTRAINTS
shell:
Expand All @@ -40,6 +47,8 @@ rule nextclade_denvX:
--min-length {params.min_length} \
--min-seed-cover {params.min_seed_cover} \
--silent \
--output-fasta {output.nextclade_alignment} \
--output-translations {params.output_translations} \
{input.sequences}
"""

Expand All @@ -48,19 +57,19 @@ rule concat_nextclade_subtype_results:
Concatenate all the nextclade results for dengue subtype classification
"""
input:
expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
nextclade_results_files = expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
output:
nextclade_subtypes="results/nextclade_subtypes.tsv",
params:
id_field=config["curate"]["id_field"],
nextclade_field=config["nextclade"]["nextclade_field"],
input_nextclade_fields=",".join([f'{key}' for key, value in config["nextclade"]["field_map"].items()]),
output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()]),
shell:
"""
echo "{params.id_field},{params.nextclade_field}" \
echo "{params.output_nextclade_fields}" \
| tr ',' '\t' \
> {output.nextclade_subtypes}
tsv-select -H -f "seqName,clade" {input} \
tsv-select -H -f "{params.input_nextclade_fields}" {input.nextclade_results_files} \
| awk 'NR>1 {{print}}' \
>> {output.nextclade_subtypes}
"""
Expand All @@ -73,21 +82,78 @@ rule append_nextclade_columns:
metadata="data/metadata_all.tsv",
nextclade_subtypes="results/nextclade_subtypes.tsv",
output:
metadata_all="results/metadata_all.tsv",
metadata_all="data/metadata_nextclade.tsv",
params:
id_field=config["curate"]["id_field"],
nextclade_field=config["nextclade"]["nextclade_field"],
id_field=list(config["nextclade"]["field_map"].values())[0],
output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()][1:]),
shell:
"""
tsv-join -H \
--filter-file {input.nextclade_subtypes} \
--key-fields {params.id_field} \
--append-fields {params.nextclade_field} \
--append-fields {params.output_nextclade_fields} \
--write-all ? \
{input.metadata} \
> {output.metadata_all}
"""

rule calculate_gene_coverage:
"""
Calculate the coverage of the gene of interest
"""
input:
nextclade_translation="data/translations/{serotype}/{gene}/seqs.gene.fasta",
output:
gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv",
wildcard_constraints:
serotype=SEROTYPE_CONSTRAINTS,
shell:
"""
python bin/calculate-gene-converage-from-nextclade-translation.py \
--fasta {input.nextclade_translation} \
--out-col {wildcards.gene}_coverage \
> {output.gene_coverage}
"""

rule aggregate_gene_coverage_by_gene:
"""
Aggregate the gene coverage results by gene
"""
input:
gene_coverage=expand("data/translations/{serotype}/{{gene}}/gene_coverage.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
output:
gene_coverage_all="results/{gene}/gene_coverage_all.tsv",
shell:
"""
tsv-append -H {input.gene_coverage} > {output.gene_coverage_all}
"""

rule append_gene_coverage_columns:
"""
Append the gene coverage results to the metadata
"""
input:
metadata="data/metadata_nextclade.tsv",
gene_coverage=expand("results/{gene}/gene_coverage_all.tsv", gene=config["nextclade"]["gene"])
output:
metadata_all="results/metadata_all.tsv",
params:
id_field=config["curate"]["id_field"],
shell:
"""
cp {input.metadata} {output.metadata_all}
for FILE in {input.gene_coverage}; do
tsv-join -H \
--filter-file $FILE \
--key-fields {params.id_field} \
--append-fields '*_coverage' \
--write-all ? \
{output.metadata_all} \
> results/temp_aggregate_gene_coverage.tsv
mv results/temp_aggregate_gene_coverage.tsv {output.metadata_all}
done
"""

rule split_metadata_by_serotype:
"""
Split the metadata by serotype
Expand Down

0 comments on commit 4ee7ec5

Please sign in to comment.