nextstrain · joverlee521 · Feb 23, 2024 · Feb 23, 2024 · Feb 23, 2024 · Feb 23, 2024
diff --git a/CHANGES.md b/CHANGES.md
@@ -5,8 +5,10 @@
 ### Bug Fixes
 
 * filter: Updated the help and report text of `--min-length` to explicitly state that the minimum length filter only counts standard nucleotide characters A, C, G, or T (case-insensitive). This has been the behavior since version 3.0.3.dev1, but has never been explicitly documented. [#1422][] (@joverlee521)
+* frequencies: Fixed a bug introduced in 24.2.0 and 24.1.0 that prevented `--regions` from working when providing regions other than the default "global" region. [#1424]
 
 [#1422]: https://github.com/nextstrain/augur/pull/1422
+[#1424]: https://github.com/nextstrain/augur/pull/1424
 
 ## 24.2.2 (16 February 2024)
 

diff --git a/augur/frequencies.py b/augur/frequencies.py
@@ -14,6 +14,8 @@
 from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN, InvalidDelimiter, Metadata, read_metadata
 from .utils import write_json
 
+REGION_COLUMN = 'region'
+DEFAULT_REGION = 'global'
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("frequencies", help=__doc__)
@@ -26,8 +28,10 @@ def register_parser(parent_subparsers):
                         help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
     parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+",
                         help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
-    parser.add_argument('--regions', type=str, nargs='+', default=['global'],
-                        help="region to subsample to")
+    parser.add_argument('--regions', type=str, nargs='+', default=[DEFAULT_REGION],
+                        help="region to filter to. " \
+                            f"Regions should match values in the {REGION_COLUMN!r} column of the metadata file " \
+                            f"if specifying values other than the default {DEFAULT_REGION!r} region.")
     parser.add_argument("--pivot-interval", type=int, default=3,
                         help="number of units between pivots")
     parser.add_argument("--pivot-interval-units", type=str, default="months", choices=['months', 'weeks'],
@@ -97,6 +101,11 @@ def run(args):
     columns_to_load = [metadata_object.id_column, METADATA_DATE_COLUMN]
     if args.weights_attribute:
         columns_to_load.append(args.weights_attribute)
+
+    filter_to_region = any(region != DEFAULT_REGION for region in args.regions)
+    if filter_to_region:
+        columns_to_load.append(REGION_COLUMN)
+
     metadata = read_metadata(
         args.metadata,
         delimiters=[metadata_object.delimiter],
@@ -130,6 +139,9 @@ def run(args):
                 # Annotate tip with weight attribute.
                 tip.attr[weights_attribute] = metadata.loc[tip.name, weights_attribute]
 
+            if filter_to_region:
+                tip.attr[REGION_COLUMN] = metadata.loc[tip.name, REGION_COLUMN]
+
         if args.method == "diffusion":
             # estimate tree frequencies
             pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date, args.pivot_interval_units)
@@ -139,10 +151,10 @@ def run(args):
             for region in args.regions:
                 # Omit strains sampled prior to the first pivot from frequency calculations.
                 # (these tend to be reference strains included for phylogenetic context)
-                if region=='global':
+                if region==DEFAULT_REGION:
                     node_filter_func = lambda node: node.attr["num_date"] >= pivots[0]
                 else:
-                    node_filter_func = lambda node: (node.attr["region"] == region
+                    node_filter_func = lambda node: (node.attr[REGION_COLUMN] == region
                                                     and node.attr["num_date"] >= pivots[0])
 
                 tree_freqs = tree_frequencies(tree, pivots, method='SLSQP',

diff --git a/tests/functional/frequencies/cram/diffusion-region.t b/tests/functional/frequencies/cram/diffusion-region.t
@@ -0,0 +1,247 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Calculate diffusion-based tip frequencies from a refined tree with `--regions`.
+
+  $ ${AUGUR} frequencies \
+  >  --method diffusion \
+  >  --tree "$TESTDIR/../data/tree.nwk" \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
+  >  --regions "global" "North America" "South America" \
+  >  --pivot-interval 3 \
+  >  --output tip-frequencies.json > /dev/null
+
+  $ cat tip-frequencies.json
+  {
+    "BRA/2016/FC_6706": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.2,
+        0.2,
+        0.2,
+        0.2
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "COL/FLR_00008/2015": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.2,
+        0.2,
+        0.2,
+        0.2
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "Colombia/2016/ZC204Se": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.2,
+        0.2,
+        0.2,
+        0.2
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "DOM/2016/BB_0183": {
+      "North America": [
+        0.25,
+        0.25,
+        0.25,
+        0.25
+      ],
+      "South America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "EcEs062_16": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.2,
+        0.2,
+        0.2,
+        0.2
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "HND/2016/HU_ME59": {
+      "North America": [
+        0.25,
+        0.25,
+        0.25,
+        0.25
+      ],
+      "South America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "PAN/CDC_259359_V1_V3/2015": {
+      "North America": [
+        0.25,
+        0.25,
+        0.25,
+        0.25
+      ],
+      "South America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "PRVABC59": {
+      "North America": [
+        0.25,
+        0.25,
+        0.25,
+        0.25
+      ],
+      "South America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "VEN/UF_1/2016": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.2,
+        0.2,
+        0.2,
+        0.2
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "ZKC2/2016": {
+      "North America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "South America": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "global": [
+        0.1,
+        0.1,
+        0.1,
+        0.1
+      ]
+    },
+    "counts": {
+      "North America": [
+        0,
+        2,
+        2,
+        0
+      ],
+      "South America": [
+        0,
+        2,
+        3,
+        0
+      ],
+      "global": [
+        0,
+        5,
+        5,
+        0
+      ]
+    },
+    "generated_by": {
+      "program": "augur",
+      "version": ".*" (re)
+    },
+    "pivots": [
+      2015.7521,
+      2016.0041,
+      2016.2527,
+      2016.5014
+    ]
+  } (no-eol)