From c5b6c0fa7289a33c6f496d57aa059a5f6ad61976 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Thu, 22 Feb 2024 15:16:52 -0800
Subject: [PATCH] Document subsampling logic

Adds comments to the phylogenetic build configuration's "subsampling"
section to explain how the section's named groups translate to
individual `augur filter` commands per group and a final `augur filter`
command that aggregates the strain lists from each named group. This
documentation may soon be made redundant by official documentation on
docs.nextstrain.org, but for now it may be useful for users.
---
 phylogenetic/config/defaults.yaml | 48 +++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/phylogenetic/config/defaults.yaml b/phylogenetic/config/defaults.yaml
index 8825455..7587660 100644
--- a/phylogenetic/config/defaults.yaml
+++ b/phylogenetic/config/defaults.yaml
@@ -4,6 +4,54 @@ strain_id_field: "accession"
 #subsampling:
   #all: --min-length '9800' --query "country == 'USA' & accession != 'NC_009942'"
 
+# Define named subsampling groups below (e.g., "state", "country", or "region").
+# The workflow will run one `augur filter` command per named group, using the
+# arguments defined for that group. Each `augur filter` command operates on all
+# available metadata and sequences and produces a text file containing the list
+# of strain names that passed the filters. The workflow will collect the union
+# of all strain names from the subsampling files and output the corresponding
+# subset of metadata and sequences that will be used to build the phylogeny.
+#
+# As an example, we could define two named subsampling groups like the
+# following:
+#
+# ```
+# subsampling:
+#   state: --query "division == 'WA'" --subsample-max-sequences 5000
+#   neighboring_state: --query "division in ['CA', 'ID', 'OR', 'NV']" --subsample-max-sequences 5000
+# ```
+#
+# These two named groups translate to the following `augur filter` commands:
+#
+# ```
+# augur filter \
+#   --sequences data/sequences_all.fasta \
+#   --metadata data/metadata_all.tsv \
+#   --query "division == 'WA'" --subsample-max-sequences 5000 \
+#   --output-strains results/subsampled_strains_state.txt
+#
+# augur filter \
+#   --sequences data/sequences_all.fasta \
+#   --metadata data/metadata_all.tsv \
+#   --query "division in ['CA', 'ID', 'OR', 'NV']" --subsample-max-sequences 5000 \
+#   --output-strains results/subsampled_strains_neighboring_state.txt
+# ```
+#
+# Then, the workflow will collect the strains from each command to extract the
+# corresponding metadata and sequences with the following command:
+#
+# ```
+# augur filter \
+#   --sequences data/sequences_all.fasta \
+#   --metadata data/metadata_all.tsv \
+#   --exclude-all \
+#   --include results/subsampled_strains_state.txt results/subsampled_strains_neighboring_state.txt \
+#   --output-sequences results/sequences_filtered.fasta \
+#   --output-metadata results/metadata_filtered.tsv
+# ```
+#
+# This command excludes all strains by default and then forces the inclusion of
+# the strains selected by the subsampling logic defined above.
 subsampling:
    state: --query "division == 'WA'" --min-length '9800' --subsample-max-sequences 5000
    neighboring_state: --query "division in ['CA', 'ID', 'OR', 'NV']" --group-by division year --min-length '9800' --subsample-max-sequences 5000