Add gene coverage columns during ingest workflow #36

nextstrain · Apr 24, 2024 · 4ee7ec5 · 4ee7ec5
2 parents afa97a4 + db300a5
commit 4ee7ec5
Show file tree

Hide file tree

Showing 3 changed files with 138 additions and 10 deletions.
diff --git a/ingest/bin/calculate-gene-converage-from-nextclade-translation.py b/ingest/bin/calculate-gene-converage-from-nextclade-translation.py
@@ -0,0 +1,54 @@
+#! /usr/bin/env python3
+
+import argparse
+from sys import stderr
+
+from Bio import SeqIO
+import re
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Calculate gene coverage from amino acid sequence. The output will be in TSV format to stdout.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--fasta",
+        type=str,
+        help="FASTA file of CDS translations from Nextclade.",
+    )
+    parser.add_argument(
+        "--out-col",
+        type=str,
+        default="gene_coverage",
+        help="Output column name.",
+    )
+    return parser.parse_args()
+
+
+def calculate_gene_coverage_from_nextclade_cds(fasta, out_col):
+    """
+    Calculate gene coverage from amino acid sequence in gene translation FASTA file from Nextclade.
+    """
+    print(f"genbank_accession\t{out_col}")
+    # Iterate over the sequences in the FASTA file
+    for record in SeqIO.parse(fasta, "fasta"):
+        sequence_id = record.id
+        sequence = str(record.seq)
+
+        # Calculate gene coverage
+        results = re.findall(r"([ACDEFGHIKLMNPQRSTVWY])",  sequence.upper())
+        gene_coverage = round(len(results) / len(sequence), 3)
+
+        # Print the results
+        print(f"{sequence_id}\t{gene_coverage}")
+
+
+def main():
+    args = parse_args()
+
+    calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_col)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -109,4 +109,12 @@ curate:
 nextclade:
   min_length: 1000 # E gene length is approximately 1400nt
   min_seed_cover: 0.1
-  nextclade_field: 'nextclade_subtype'
+  gene: ['E','NS1']
+  # Nextclade Fields to rename to metadata field names.
+  field_map:
+    seqName: genbank_accession # ID field used to merge annotations
+    clade: nextclade_subtype
+    alignmentStart: alignmentStart
+    alignmentEnd: alignmentEnd
+    coverage: genome_coverage
+    failedCdses: failedCdses
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -19,16 +19,23 @@ SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES)
 rule nextclade_denvX:
     """
     For each type, classify into the appropriate subtype
+    1. Capture the alignment
+    2. Capture the translations of gene(s) of interest
+
+    Note: If using --cds-selection, only the thoese genes are reported in the failedCdses column
     """
     input:
         sequences="results/sequences_{serotype}.fasta",
         dataset="../nextclade_data/{serotype}",
     output:
         nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv",
+        nextclade_alignment="results/aligned_{serotype}.fasta",
+        nextclade_translations=expand("data/translations/{{serotype}}/{gene}/seqs.gene.fasta", gene=config["nextclade"]["gene"]),
     threads: 4
     params:
         min_length=config["nextclade"]["min_length"],
         min_seed_cover=config["nextclade"]["min_seed_cover"],
+        output_translations = lambda wildcards: f"data/translations/{wildcards.serotype}/{{cds}}/seqs.gene.fasta",
     wildcard_constraints:
         serotype=SEROTYPE_CONSTRAINTS
     shell:
@@ -40,6 +47,8 @@ rule nextclade_denvX:
           --min-length {params.min_length} \
           --min-seed-cover {params.min_seed_cover} \
           --silent \
+          --output-fasta {output.nextclade_alignment} \
+          --output-translations {params.output_translations} \
           {input.sequences}
         """
 
@@ -48,19 +57,19 @@ rule concat_nextclade_subtype_results:
     Concatenate all the nextclade results for dengue subtype classification
     """
     input:
-        expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
+        nextclade_results_files = expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
     output:
         nextclade_subtypes="results/nextclade_subtypes.tsv",
     params:
-        id_field=config["curate"]["id_field"],
-        nextclade_field=config["nextclade"]["nextclade_field"],
+        input_nextclade_fields=",".join([f'{key}' for key, value in config["nextclade"]["field_map"].items()]),
+        output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()]),
     shell:
         """
-        echo "{params.id_field},{params.nextclade_field}" \
+        echo "{params.output_nextclade_fields}" \
         | tr ',' '\t' \
         > {output.nextclade_subtypes}
 
-        tsv-select -H -f "seqName,clade" {input} \
+        tsv-select -H -f "{params.input_nextclade_fields}" {input.nextclade_results_files} \
         | awk 'NR>1 {{print}}' \
         >> {output.nextclade_subtypes}
         """
@@ -73,21 +82,78 @@ rule append_nextclade_columns:
         metadata="data/metadata_all.tsv",
         nextclade_subtypes="results/nextclade_subtypes.tsv",
     output:
-        metadata_all="results/metadata_all.tsv",
+        metadata_all="data/metadata_nextclade.tsv",
     params:
-        id_field=config["curate"]["id_field"],
-        nextclade_field=config["nextclade"]["nextclade_field"],
+        id_field=list(config["nextclade"]["field_map"].values())[0],
+        output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()][1:]),
     shell:
         """
         tsv-join -H \
             --filter-file {input.nextclade_subtypes} \
             --key-fields {params.id_field} \
-            --append-fields {params.nextclade_field} \
+            --append-fields {params.output_nextclade_fields} \
             --write-all ? \
             {input.metadata} \
         > {output.metadata_all}
         """
 
+rule calculate_gene_coverage:
+    """
+    Calculate the coverage of the gene of interest
+    """
+    input:
+        nextclade_translation="data/translations/{serotype}/{gene}/seqs.gene.fasta",
+    output:
+        gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv",
+    wildcard_constraints:
+        serotype=SEROTYPE_CONSTRAINTS,
+    shell:
+        """
+        python bin/calculate-gene-converage-from-nextclade-translation.py \
+          --fasta {input.nextclade_translation} \
+          --out-col {wildcards.gene}_coverage \
+          > {output.gene_coverage}
+        """
+
+rule aggregate_gene_coverage_by_gene:
+    """
+    Aggregate the gene coverage results by gene
+    """
+    input:
+        gene_coverage=expand("data/translations/{serotype}/{{gene}}/gene_coverage.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
+    output:
+        gene_coverage_all="results/{gene}/gene_coverage_all.tsv",
+    shell:
+        """
+        tsv-append -H {input.gene_coverage} > {output.gene_coverage_all}
+        """
+
+rule append_gene_coverage_columns:
+    """
+    Append the gene coverage results to the metadata
+    """
+    input:
+        metadata="data/metadata_nextclade.tsv",
+        gene_coverage=expand("results/{gene}/gene_coverage_all.tsv", gene=config["nextclade"]["gene"])
+    output:
+        metadata_all="results/metadata_all.tsv",
+    params:
+        id_field=config["curate"]["id_field"],
+    shell:
+        """
+        cp {input.metadata} {output.metadata_all}
+        for FILE in {input.gene_coverage}; do
+            tsv-join -H \
+                --filter-file $FILE \
+                --key-fields {params.id_field} \
+                --append-fields '*_coverage' \
+                --write-all ? \
+                {output.metadata_all} \
+            > results/temp_aggregate_gene_coverage.tsv
+            mv results/temp_aggregate_gene_coverage.tsv {output.metadata_all}
+        done
+        """
+
 rule split_metadata_by_serotype:
     """
     Split the metadata by serotype