Prototype metadata-only filtering

Builds on a prototype augur interface for metadata-only filtering [1] to speed up the ncov subsampling steps. Specifically, subsampling writes out lists of selected strain names to text files instead of writing out sequences. Then the "combine samples" step extracts sequences from the unique list of strain names with samtools faidx. The resulting workflow requires substantially less I/O and speeds up subsampling from an average of 163 seconds per job (for 12 jobs in a Nextstrain global build) to 36 seconds per job. One downside to the current metadata-only approach in Augur that reveals itself here is that its possible to have mismatches between metadata and sequences such that a strain has metadata but does not have a sequence in the filtered FASTA. One way around this issue is to export the filtered metadata earlier in the pipeline. For now, we use samtools faidx with the `-c` flag to ignore missing sequences. [1] nextstrain/augur#679
nextstrain · Feb 18, 2021 · a363be3 · a363be3
1 parent a9a9aa0
commit a363be3
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 11 deletions.
diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
@@ -13,11 +13,12 @@ dependencies:
 - python=3.6*
 - nodejs=10
 - raxml
+- samtools
 - vcftools
 - git
 - pip
 - pip:
   - awscli==1.18.45
-  - git+https://github.com/nextstrain/augur@add-index-command
+  - git+https://github.com/nextstrain/augur@metadata-only-filter
   - nextstrain-cli==1.16.5
   - rethinkdb==2.3.0.post6
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
@@ -305,6 +305,25 @@ rule filter:
             --output {output.sequences} 2>&1 | tee {log}
         """
 
+rule fasta_index_filtered_sequences:
+    message:
+        """
+        samtools faidx filtered sequences.
+        """
+    input:
+        sequences = "results/filtered.fasta"
+    output:
+        sequence_index = "results/filtered.fasta.fai"
+    log:
+        "logs/samtools_faidx_sequences.txt"
+    benchmark:
+        "benchmarks/samtools_faidx_sequences.txt"
+    conda: config["conda_environment"]
+    shell:
+        """
+        samtools faidx {input.sequences}
+        """
+
 def _get_subsampling_settings(wildcards):
     # Allow users to override default subsampling with their own settings keyed
     # by location type and name. For example, "region_europe" or
@@ -401,14 +420,13 @@ rule subsample:
          - priority: {params.priority_argument}
         """
     input:
-        sequences = "results/filtered.fasta",
         sequence_index = "results/sequence_index.tsv",
         metadata = config["metadata"],
         include = config["files"]["include"],
         priorities = get_priorities,
         exclude = config["files"]["exclude"]
     output:
-        sequences = "results/{build_name}/sample-{subsample}.fasta"
+        strains = "results/{build_name}/sample-{subsample}.txt"
     log:
         "logs/subsample_{build_name}_{subsample}.txt"
     benchmark:
@@ -429,7 +447,6 @@ rule subsample:
     shell:
         """
         augur filter \
-            --sequences {input.sequences} \
             --sequence-index {input.sequence_index} \
             --metadata {input.metadata} \
             --include {input.include} \
@@ -445,7 +462,7 @@ rule subsample:
             {params.sequences_per_group} \
             {params.subsample_max_sequences} \
             {params.sampling_scheme} \
-            --output {output.sequences} 2>&1 | tee {log}
+            --output-strains {output.strains} 2>&1 | tee {log}
         """
 
 rule proximity_score:
@@ -481,27 +498,40 @@ def _get_subsampled_files(wildcards):
     subsampling_settings = _get_subsampling_settings(wildcards)
 
     return [
-        f"results/{wildcards.build_name}/sample-{subsample}.fasta"
+        f"results/{wildcards.build_name}/sample-{subsample}.txt"
         for subsample in subsampling_settings
     ]
 
+rule combine_sample_names:
+    input:
+        include=_get_subsampled_files
+    output:
+        strains = "results/{build_name}/subsampled_strains.txt"
+    log:
+        "logs/subsampled_strains_{build_name}.txt"
+    conda: config["conda_environment"]
+    shell:
+        """
+        sort -u {input.include} > {output}
+        """
+
 rule combine_samples:
     message:
         """
-        Combine and deduplicate FASTAs
+        Extract subsampled strains to a FASTA file.
         """
     input:
-        _get_subsampled_files
+        sequences = "results/filtered.fasta",
+        samtools_index = "results/filtered.fasta.fai",
+        strains = "results/{build_name}/subsampled_strains.txt"
     output:
         alignment = "results/{build_name}/subsampled_alignment.fasta"
     log:
         "logs/subsample_regions_{build_name}.txt"
     conda: config["conda_environment"]
     shell:
         """
-        python3 scripts/combine-and-dedup-fastas.py \
-            --input {input} \
-            --output {output} 2>&1 | tee {log}
+        samtools faidx -c -r {input.strains} -o {output.alignment} {input.sequences} 2>&1 | tee {log}
         """
 
 # TODO: This will probably not work for build names like "country_usa" where we need to know the country is "USA".