From 5177fbc3471cf19f847e6e3607d50cd18e0bcad7 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 25 Apr 2024 11:15:27 -0700
Subject: [PATCH 1/5] Copy filtering/subsampling guide into this repo

Future commits will modify the contents to fit this repo better.

Copied from <https://github.com/nextstrain/augur/blob/430a9766b551ae3c581450bc614bf2250e3d6c05/docs/usage/cli/snippets/filtering-and-subsampling.rst>
---
 .gitignore                                    |   1 -
 src/fetch-docs.py                             |   1 -
 .../filtering-and-subsampling.rst             | 208 +++++++++++++++++-
 3 files changed, 207 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 155988ba..a86f89b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,3 @@ src/guides/bioinformatics/translate_ref.md
 src/guides/bioinformatics/vcf_input.md
 src/guides/communicate/create-pdf.md
 src/guides/communicate/narratives-intro.md
-src/snippets/filtering-and-subsampling.rst
diff --git a/src/fetch-docs.py b/src/fetch-docs.py
index f5d78df1..985212b0 100755
--- a/src/fetch-docs.py
+++ b/src/fetch-docs.py
@@ -22,7 +22,6 @@
     f'{augur_url}faq/fasta_input.md': 'guides/bioinformatics/fasta_input.md',
     f'{augur_url}faq/seq_traits.md': 'guides/bioinformatics/seq_traits.md',
     f'{augur_url}examples/examples.rst': 'guides/bioinformatics/examples.rst',
-    f'{augur_url}usage/cli/snippets/filtering-and-subsampling.rst': 'snippets/filtering-and-subsampling.rst',
 }
 
 if __name__ == '__main__':
diff --git a/src/guides/bioinformatics/filtering-and-subsampling.rst b/src/guides/bioinformatics/filtering-and-subsampling.rst
index 4522e63a..22a63d1f 100644
--- a/src/guides/bioinformatics/filtering-and-subsampling.rst
+++ b/src/guides/bioinformatics/filtering-and-subsampling.rst
@@ -8,4 +8,210 @@ sample data.
 .. contents:: Table of Contents
    :local:
 
-.. include:: ../../snippets/filtering-and-subsampling.rst
+Filtering
+---------
+
+The filter command allows you to select various subsets of your input data for different types of analysis.
+A simple example use of this command would be
+
+.. code-block:: bash
+
+  augur filter \
+    --sequences data/sequences.fasta \
+    --metadata data/metadata.tsv \
+    --min-date 2012 \
+    --output-sequences filtered_sequences.fasta \
+    --output-metadata filtered_metadata.tsv
+
+This command will select all sequences with collection date in 2012 or later.
+The filter command has a large number of options that allow flexible filtering for many common situations.
+One such use-case is the exclusion of sequences that are known to be outliers (e.g. because of sequencing errors, cell-culture adaptation, ...).
+These can be specified in a separate text file (e.g. ``exclude.txt``):
+
+.. code-block::
+
+  BRA/2016/FC_DQ75D1
+  COL/FLR_00034/2015
+  ...
+
+To drop such strains, you can pass the filename to ``--exclude``:
+
+.. code-block:: bash
+
+  augur filter \
+    --sequences data/sequences.fasta \
+    --metadata data/metadata.tsv \
+    --min-date 2012 \
+    --exclude exclude.txt \
+    --output-sequences filtered_sequences.fasta \
+    --output-metadata filtered_metadata.tsv
+
+Subsampling within ``augur filter``
+-----------------------------------
+
+Another common filtering operation is subsetting of data to a achieve a more even spatio-temporal distribution or to cut-down data set size to more manageable numbers.
+The filter command allows you to select a specific number of sequences from specific groups, for example one sequence per month from each country:
+
+.. code-block:: bash
+
+  augur filter \
+    --sequences data/sequences.fasta \
+    --metadata data/metadata.tsv \
+    --min-date 2012 \
+    --exclude exclude.txt \
+    --group-by country year month \
+    --sequences-per-group 1 \
+    --output-sequences subsampled_sequences.fasta \
+    --output-metadata subsampled_metadata.tsv
+
+Subsampling using multiple ``augur filter`` commands
+----------------------------------------------------
+
+There are some subsampling strategies in which a single call to ``augur filter``
+does not suffice. One such strategy is "tiered subsampling". In this strategy,
+mutually exclusive sets of filters, each representing a "tier", are sampled with
+different subsampling rules. This is commonly used to create geographic tiers.
+Consider this subsampling scheme:
+
+    Sample 100 sequences from Washington state and 50 sequences from the rest of the United States.
+
+This cannot be done in a single call to ``augur filter``. Instead, it can be
+decomposed into multiple schemes, each handled by a single call to ``augur
+filter``. Additionally, there is an extra step to combine the intermediate
+samples.
+
+    1. Sample 100 sequences from Washington state.
+    2. Sample 50 sequences from the rest of the United States.
+    3. Combine the samples.
+
+Calling ``augur filter`` multiple times
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A basic approach is to run the ``augur filter`` commands directly. This works
+well for ad-hoc analyses.
+
+.. code-block:: bash
+
+  # 1. Sample 100 sequences from Washington state
+  augur filter \
+    --sequences sequences.fasta \
+    --metadata metadata.tsv \
+    --query "state == 'WA'" \
+    --subsample-max-sequences 100 \
+    --output-strains sample_strains_state.txt
+
+  # 2. Sample 50 sequences from the rest of the United States
+  augur filter \
+    --sequences sequences.fasta \
+    --metadata metadata.tsv \
+    --query "state != 'WA' & country == 'USA'" \
+    --subsample-max-sequences 50 \
+    --output-strains sample_strains_country.txt
+
+  # 3. Combine using augur filter
+  augur filter \
+    --sequences sequences.fasta \
+    --metadata metadata.tsv \
+    --exclude-all \
+    --include sample_strains_state.txt \
+              sample_strains_country.txt \
+    --output-sequences subsampled_sequences.fasta \
+    --output-metadata subsampled_metadata.tsv
+
+Each intermediate sample is represented by a strain list file obtained from
+``--output-strains``. The final step uses ``augur filter`` with ``--exclude-all``
+and ``--include`` to sample the data based on the intermediate strain list
+files. If the same strain appears in both files, ``augur filter`` will only
+write it once in each of the final outputs.
+
+Generalizing subsampling in a workflow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The approach above can be cumbersome with more intermediate samples. To
+generalize this process and allow for more flexibility, a workflow management
+system can be used. The following examples use `Snakemake`_.
+
+1. Add a section in the `config file`_.
+
+  .. code-block:: yaml
+
+    subsampling:
+      state: --query "state == 'WA'" --subsample-max-sequences 100
+      country: --query "state != 'WA' & country == 'USA'" --subsample-max-sequences 50
+
+2. Add two rules in a `Snakefile`_. If you are building a standard Nextstrain
+   workflow, the output files should be used as input to sequence alignment. See
+   :doc:`docs.nextstrain.org:learn/parts` to learn more about the placement of
+   this step within a workflow.
+
+  .. code-block:: python
+
+    # 1. Sample 100 sequences from Washington state
+    # 2. Sample 50 sequences from the rest of the United States
+    rule intermediate_sample:
+        input:
+            metadata = "data/metadata.tsv",
+        output:
+            strains = "results/sample_strains_{sample_name}.txt",
+        params:
+            augur_filter_args = lambda wildcards: config.get("subsampling", {}).get(wildcards.sample_name, "")
+        shell:
+            """
+            augur filter \
+                --metadata {input.metadata} \
+                {params.augur_filter_args} \
+                --output-strains {output.strains}
+            """
+
+    # 3. Combine using augur filter
+    rule combine_intermediate_samples:
+        input:
+            sequences = "data/sequences.fasta",
+            metadata = "data/metadata.tsv",
+            intermediate_sample_strains = expand("results/sample_strains_{sample_name}.txt", sample_name=list(config.get("subsampling", {}).keys()))
+        output:
+            sequences = "results/subsampled_sequences.fasta",
+            metadata = "results/subsampled_metadata.tsv",
+        shell:
+            """
+            augur filter \
+                --sequences {input.sequences} \
+                --metadata {input.metadata} \
+                --exclude-all \
+                --include {input.intermediate_sample_strains} \
+                --output-sequences {output.sequences} \
+                --output-metadata {output.metadata}
+            """
+
+3. Run Snakemake targeting the second rule.
+
+  .. code-block:: bash
+
+    snakemake combine_intermediate_samples
+
+Explanation:
+
+- The configuration section consists of one entry per intermediate sample in the
+  format ``sample_name: <augur filter arguments>``.
+- The first rule is run once per intermediate sample using `wildcards`_ and an
+  `input function`_. The output of each run is the sampled strain list.
+- The second rule uses `expand()`_ to define input as all the intermediate
+  sampled strain lists, which are passed directly to ``--include`` as done in
+  the previous example.
+
+It is easy to add or remove intermediate samples. The configuration above can be
+updated to add another tier in between state and country:
+
+  .. code-block:: yaml
+
+    subsampling:
+      state: --query "state == 'WA'" --subsample-max-sequences 100
+      neighboring_states: --query "state in {'CA', 'ID', 'OR', 'NV'}" --subsample-max-sequences 75
+      country: --query "country == 'USA' & state not in {'WA', 'CA', 'ID', 'OR', 'NV'}" --subsample-max-sequences 50
+
+.. _Snakemake: https://snakemake.readthedocs.io/en/stable/index.html
+.. _config file: https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#snakefiles-standard-configuration
+.. _Snakefile: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html
+.. _wildcards: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#wildcards
+.. _input function: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#snakefiles-input-functions
+.. _expand(): https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-expand-function

From 7a50339b3105500d6d6815d3dc75e9d4f4d30eb0 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 25 Apr 2024 11:24:17 -0700
Subject: [PATCH 2/5] Use indent size of 3 spaces

For consistency with other files in this project.

Retain separate indent sizes for code blocks.
---
 .../filtering-and-subsampling.rst             | 203 +++++++++---------
 1 file changed, 101 insertions(+), 102 deletions(-)

diff --git a/src/guides/bioinformatics/filtering-and-subsampling.rst b/src/guides/bioinformatics/filtering-and-subsampling.rst
index 22a63d1f..6a8c47b6 100644
--- a/src/guides/bioinformatics/filtering-and-subsampling.rst
+++ b/src/guides/bioinformatics/filtering-and-subsampling.rst
@@ -16,12 +16,12 @@ A simple example use of this command would be
 
 .. code-block:: bash
 
-  augur filter \
-    --sequences data/sequences.fasta \
-    --metadata data/metadata.tsv \
-    --min-date 2012 \
-    --output-sequences filtered_sequences.fasta \
-    --output-metadata filtered_metadata.tsv
+   augur filter \
+     --sequences data/sequences.fasta \
+     --metadata data/metadata.tsv \
+     --min-date 2012 \
+     --output-sequences filtered_sequences.fasta \
+     --output-metadata filtered_metadata.tsv
 
 This command will select all sequences with collection date in 2012 or later.
 The filter command has a large number of options that allow flexible filtering for many common situations.
@@ -30,21 +30,21 @@ These can be specified in a separate text file (e.g. ``exclude.txt``):
 
 .. code-block::
 
-  BRA/2016/FC_DQ75D1
-  COL/FLR_00034/2015
-  ...
+   BRA/2016/FC_DQ75D1
+   COL/FLR_00034/2015
+   ...
 
 To drop such strains, you can pass the filename to ``--exclude``:
 
 .. code-block:: bash
 
-  augur filter \
-    --sequences data/sequences.fasta \
-    --metadata data/metadata.tsv \
-    --min-date 2012 \
-    --exclude exclude.txt \
-    --output-sequences filtered_sequences.fasta \
-    --output-metadata filtered_metadata.tsv
+   augur filter \
+     --sequences data/sequences.fasta \
+     --metadata data/metadata.tsv \
+     --min-date 2012 \
+     --exclude exclude.txt \
+     --output-sequences filtered_sequences.fasta \
+     --output-metadata filtered_metadata.tsv
 
 Subsampling within ``augur filter``
 -----------------------------------
@@ -54,15 +54,15 @@ The filter command allows you to select a specific number of sequences from spec
 
 .. code-block:: bash
 
-  augur filter \
-    --sequences data/sequences.fasta \
-    --metadata data/metadata.tsv \
-    --min-date 2012 \
-    --exclude exclude.txt \
-    --group-by country year month \
-    --sequences-per-group 1 \
-    --output-sequences subsampled_sequences.fasta \
-    --output-metadata subsampled_metadata.tsv
+   augur filter \
+     --sequences data/sequences.fasta \
+     --metadata data/metadata.tsv \
+     --min-date 2012 \
+     --exclude exclude.txt \
+     --group-by country year month \
+     --sequences-per-group 1 \
+     --output-sequences subsampled_sequences.fasta \
+     --output-metadata subsampled_metadata.tsv
 
 Subsampling using multiple ``augur filter`` commands
 ----------------------------------------------------
@@ -73,16 +73,16 @@ mutually exclusive sets of filters, each representing a "tier", are sampled with
 different subsampling rules. This is commonly used to create geographic tiers.
 Consider this subsampling scheme:
 
-    Sample 100 sequences from Washington state and 50 sequences from the rest of the United States.
+   Sample 100 sequences from Washington state and 50 sequences from the rest of the United States.
 
 This cannot be done in a single call to ``augur filter``. Instead, it can be
 decomposed into multiple schemes, each handled by a single call to ``augur
 filter``. Additionally, there is an extra step to combine the intermediate
 samples.
 
-    1. Sample 100 sequences from Washington state.
-    2. Sample 50 sequences from the rest of the United States.
-    3. Combine the samples.
+   1. Sample 100 sequences from Washington state.
+   2. Sample 50 sequences from the rest of the United States.
+   3. Combine the samples.
 
 Calling ``augur filter`` multiple times
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -92,31 +92,31 @@ well for ad-hoc analyses.
 
 .. code-block:: bash
 
-  # 1. Sample 100 sequences from Washington state
-  augur filter \
-    --sequences sequences.fasta \
-    --metadata metadata.tsv \
-    --query "state == 'WA'" \
-    --subsample-max-sequences 100 \
-    --output-strains sample_strains_state.txt
-
-  # 2. Sample 50 sequences from the rest of the United States
-  augur filter \
-    --sequences sequences.fasta \
-    --metadata metadata.tsv \
-    --query "state != 'WA' & country == 'USA'" \
-    --subsample-max-sequences 50 \
-    --output-strains sample_strains_country.txt
-
-  # 3. Combine using augur filter
-  augur filter \
-    --sequences sequences.fasta \
-    --metadata metadata.tsv \
-    --exclude-all \
-    --include sample_strains_state.txt \
-              sample_strains_country.txt \
-    --output-sequences subsampled_sequences.fasta \
-    --output-metadata subsampled_metadata.tsv
+   # 1. Sample 100 sequences from Washington state
+   augur filter \
+     --sequences sequences.fasta \
+     --metadata metadata.tsv \
+     --query "state == 'WA'" \
+     --subsample-max-sequences 100 \
+     --output-strains sample_strains_state.txt
+
+   # 2. Sample 50 sequences from the rest of the United States
+   augur filter \
+     --sequences sequences.fasta \
+     --metadata metadata.tsv \
+     --query "state != 'WA' & country == 'USA'" \
+     --subsample-max-sequences 50 \
+     --output-strains sample_strains_country.txt
+
+   # 3. Combine using augur filter
+   augur filter \
+     --sequences sequences.fasta \
+     --metadata metadata.tsv \
+     --exclude-all \
+     --include sample_strains_state.txt \
+               sample_strains_country.txt \
+     --output-sequences subsampled_sequences.fasta \
+     --output-metadata subsampled_metadata.tsv
 
 Each intermediate sample is represented by a strain list file obtained from
 ``--output-strains``. The final step uses ``augur filter`` with ``--exclude-all``
@@ -133,61 +133,60 @@ system can be used. The following examples use `Snakemake`_.
 
 1. Add a section in the `config file`_.
 
-  .. code-block:: yaml
+   .. code-block:: yaml
 
-    subsampling:
-      state: --query "state == 'WA'" --subsample-max-sequences 100
-      country: --query "state != 'WA' & country == 'USA'" --subsample-max-sequences 50
+      subsampling:
+        state: --query "state == 'WA'" --subsample-max-sequences 100
+        country: --query "state != 'WA' & country == 'USA'" --subsample-max-sequences 50
 
 2. Add two rules in a `Snakefile`_. If you are building a standard Nextstrain
    workflow, the output files should be used as input to sequence alignment. See
    :doc:`docs.nextstrain.org:learn/parts` to learn more about the placement of
    this step within a workflow.
 
-  .. code-block:: python
-
-    # 1. Sample 100 sequences from Washington state
-    # 2. Sample 50 sequences from the rest of the United States
-    rule intermediate_sample:
-        input:
-            metadata = "data/metadata.tsv",
-        output:
-            strains = "results/sample_strains_{sample_name}.txt",
-        params:
-            augur_filter_args = lambda wildcards: config.get("subsampling", {}).get(wildcards.sample_name, "")
-        shell:
-            """
-            augur filter \
-                --metadata {input.metadata} \
-                {params.augur_filter_args} \
-                --output-strains {output.strains}
-            """
-
-    # 3. Combine using augur filter
-    rule combine_intermediate_samples:
-        input:
-            sequences = "data/sequences.fasta",
-            metadata = "data/metadata.tsv",
-            intermediate_sample_strains = expand("results/sample_strains_{sample_name}.txt", sample_name=list(config.get("subsampling", {}).keys()))
-        output:
-            sequences = "results/subsampled_sequences.fasta",
-            metadata = "results/subsampled_metadata.tsv",
-        shell:
-            """
-            augur filter \
-                --sequences {input.sequences} \
-                --metadata {input.metadata} \
-                --exclude-all \
-                --include {input.intermediate_sample_strains} \
-                --output-sequences {output.sequences} \
-                --output-metadata {output.metadata}
-            """
+   .. code-block:: python
+
+      # 1. Sample 100 sequences from Washington state
+      # 2. Sample 50 sequences from the rest of the United States
+      rule intermediate_sample:
+          input:
+              metadata = "data/metadata.tsv",
+          output:
+              strains = "results/sample_strains_{sample_name}.txt",
+          params:
+              augur_filter_args = lambda wildcards: config.get("subsampling", {}).get(wildcards.sample_name, "")
+          shell:
+              """
+              augur filter \
+                  --metadata {input.metadata} \
+                  {params.augur_filter_args} \
+                  --output-strains {output.strains}
+              """
+      # 3. Combine using augur filter
+      rule combine_intermediate_samples:
+          input:
+              sequences = "data/sequences.fasta",
+              metadata = "data/metadata.tsv",
+              intermediate_sample_strains = expand("results/sample_strains_{sample_name}.txt", sample_name=list(config.get("subsampling", {}).keys()))
+          output:
+              sequences = "results/subsampled_sequences.fasta",
+              metadata = "results/subsampled_metadata.tsv",
+          shell:
+              """
+              augur filter \
+                  --sequences {input.sequences} \
+                  --metadata {input.metadata} \
+                  --exclude-all \
+                  --include {input.intermediate_sample_strains} \
+                  --output-sequences {output.sequences} \
+                  --output-metadata {output.metadata}
+              """
 
 3. Run Snakemake targeting the second rule.
 
-  .. code-block:: bash
+   .. code-block:: bash
 
-    snakemake combine_intermediate_samples
+      snakemake combine_intermediate_samples
 
 Explanation:
 
@@ -204,10 +203,10 @@ updated to add another tier in between state and country:
 
-  .. code-block:: yaml
+.. code-block:: yaml
 
-    subsampling:
-      state: --query "state == 'WA'" --subsample-max-sequences 100
-      neighboring_states: --query "state in {'CA', 'ID', 'OR', 'NV'}" --subsample-max-sequences 75
-      country: --query "country == 'USA' & state not in {'WA', 'CA', 'ID', 'OR', 'NV'}" --subsample-max-sequences 50
+   subsampling:
+     state: --query "state == 'WA'" --subsample-max-sequences 100
+     neighboring_states: --query "state in {'CA', 'ID', 'OR', 'NV'}" --subsample-max-sequences 75
+     country: --query "country == 'USA' & state not in {'WA', 'CA', 'ID', 'OR', 'NV'}" --subsample-max-sequences 50
 
 .. _Snakemake: https://snakemake.readthedocs.io/en/stable/index.html
 .. _config file: https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#snakefiles-standard-configuration

From a21e2cd6644a43e229e6f4c61afaad113e951769 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 25 Apr 2024 11:25:53 -0700
Subject: [PATCH 3/5] Word wrap at 80 characters

For consistency within this file.
---
 .../filtering-and-subsampling.rst             | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/guides/bioinformatics/filtering-and-subsampling.rst b/src/guides/bioinformatics/filtering-and-subsampling.rst
index 6a8c47b6..751fa56e 100644
--- a/src/guides/bioinformatics/filtering-and-subsampling.rst
+++ b/src/guides/bioinformatics/filtering-and-subsampling.rst
@@ -11,8 +11,8 @@ sample data.
 Filtering
 ---------
 
-The filter command allows you to select various subsets of your input data for different types of analysis.
-A simple example use of this command would be
+The filter command allows you to select various subsets of your input data for
+different types of analysis. A simple example use of this command would be
 
 .. code-block:: bash
 
@@ -24,9 +24,11 @@ A simple example use of this command would be
      --output-metadata filtered_metadata.tsv
 
 This command will select all sequences with collection date in 2012 or later.
-The filter command has a large number of options that allow flexible filtering for many common situations.
-One such use-case is the exclusion of sequences that are known to be outliers (e.g. because of sequencing errors, cell-culture adaptation, ...).
-These can be specified in a separate text file (e.g. ``exclude.txt``):
+The filter command has a large number of options that allow flexible filtering
+for many common situations. One such use-case is the exclusion of sequences that
+are known to be outliers (e.g. because of sequencing errors, cell-culture
+adaptation, ...). These can be specified in a separate text file (e.g.
+``exclude.txt``):
 
 .. code-block::
 
@@ -49,8 +51,11 @@ To drop such strains, you can pass the filename to ``--exclude``:
 Subsampling within ``augur filter``
 -----------------------------------
 
-Another common filtering operation is subsetting of data to a achieve a more even spatio-temporal distribution or to cut-down data set size to more manageable numbers.
-The filter command allows you to select a specific number of sequences from specific groups, for example one sequence per month from each country:
+Another common filtering operation is subsetting of data to a achieve a more
+even spatio-temporal distribution or to cut-down data set size to more
+manageable numbers. The filter command allows you to select a specific number of
+sequences from specific groups, for example one sequence per month from each
+country:
 
 .. code-block:: bash
 

From 5790eba38d282b5feaa3da53206baa571ad7f16d Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 25 Apr 2024 11:27:52 -0700
Subject: [PATCH 4/5] Update heading punctuation character

For consistency with other files in this project.
---
 .../bioinformatics/filtering-and-subsampling.rst       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/guides/bioinformatics/filtering-and-subsampling.rst b/src/guides/bioinformatics/filtering-and-subsampling.rst
index 751fa56e..9eed64a8 100644
--- a/src/guides/bioinformatics/filtering-and-subsampling.rst
+++ b/src/guides/bioinformatics/filtering-and-subsampling.rst
@@ -9,7 +9,7 @@ sample data.
    :local:
 
 Filtering
----------
+=========
 
 The filter command allows you to select various subsets of your input data for
 different types of analysis. A simple example use of this command would be
@@ -49,7 +49,7 @@ To drop such strains, you can pass the filename to ``--exclude``:
      --output-metadata filtered_metadata.tsv
 
 Subsampling within ``augur filter``
------------------------------------
+===================================
 
 Another common filtering operation is subsetting of data to a achieve a more
 even spatio-temporal distribution or to cut-down data set size to more
@@ -70,7 +70,7 @@ country:
      --output-metadata subsampled_metadata.tsv
 
 Subsampling using multiple ``augur filter`` commands
-----------------------------------------------------
+====================================================
 
 There are some subsampling strategies in which a single call to ``augur filter``
 does not suffice. One such strategy is "tiered subsampling". In this strategy,
@@ -90,7 +90,7 @@ samples.
    3. Combine the samples.
 
 Calling ``augur filter`` multiple times
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------------
 
 A basic approach is to run the ``augur filter`` commands directly. This works
 well for ad-hoc analyses.
@@ -130,7 +130,7 @@ files. If the same strain appears in both files, ``augur filter`` will only
 write it once in each of the final outputs.
 
 Generalizing subsampling in a workflow
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------------------
 
 The approach above can be cumbersome with more intermediate samples. To
 generalize this process and allow for more flexibility, a workflow management

From 1f77c01c3793904235f4929b0564ecb4536a8a69 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 25 Apr 2024 11:30:23 -0700
Subject: [PATCH 5/5] Reference internal page directly

The external reference is no longer necessary now that the contents are
in this project.
---
 src/conf.py                                             | 2 --
 src/guides/bioinformatics/filtering-and-subsampling.rst | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/conf.py b/src/conf.py
index 25ef461d..15b73f5f 100644
--- a/src/conf.py
+++ b/src/conf.py
@@ -82,8 +82,6 @@
     'augur': ('https://docs.nextstrain.org/projects/augur/page/', None),
     'auspice': ('https://docs.nextstrain.org/projects/auspice/page/', None),
     'cli': ('https://docs.nextstrain.org/projects/cli/page/', None),
-    # For externally sourced snippets that reference this docs project
-    'docs.nextstrain.org': ('https://docs.nextstrain.org/page/', None),
     'nextclade': ('https://docs.nextstrain.org/projects/nextclade/page/', None),
     'ncov': ('https://docs.nextstrain.org/projects/ncov/page/', None)
 }
diff --git a/src/guides/bioinformatics/filtering-and-subsampling.rst b/src/guides/bioinformatics/filtering-and-subsampling.rst
index 9eed64a8..f6fa8f77 100644
--- a/src/guides/bioinformatics/filtering-and-subsampling.rst
+++ b/src/guides/bioinformatics/filtering-and-subsampling.rst
@@ -146,7 +146,7 @@ system can be used. The following examples use `Snakemake`_.
 
 2. Add two rules in a `Snakefile`_. If you are building a standard Nextstrain
    workflow, the output files should be used as input to sequence alignment. See
-   :doc:`docs.nextstrain.org:learn/parts` to learn more about the placement of
+   :doc:`../../learn/parts` to learn more about the placement of
    this step within a workflow.
 
    .. code-block:: python