diff --git a/CHANGES.md b/CHANGES.md index 537d256ca..566d2ca26 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,8 +5,10 @@ ### Bug Fixes * filter: Updated the help and report text of `--min-length` to explicitly state that the minimum length filter only counts standard nucleotide characters A, C, G, or T (case-insensitive). This has been the behavior since version 3.0.3.dev1, but has never been explicitly documented. [#1422][] (@joverlee521) +* frequencies: Fixed a bug introduced in 24.2.0 and 24.1.0 that prevented `--regions` from working when providing regions other than the default "global" region. [#1424] [#1422]: https://github.com/nextstrain/augur/pull/1422 +[#1424]: https://github.com/nextstrain/augur/pull/1424 ## 24.2.2 (16 February 2024) diff --git a/augur/frequencies.py b/augur/frequencies.py index 48f2b98fd..75a11ee3c 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -14,6 +14,8 @@ from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN, InvalidDelimiter, Metadata, read_metadata from .utils import write_json +REGION_COLUMN = 'region' +DEFAULT_REGION = 'global' def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("frequencies", help=__doc__) @@ -26,8 +28,10 @@ def register_parser(parent_subparsers): help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") - parser.add_argument('--regions', type=str, nargs='+', default=['global'], - help="region to subsample to") + parser.add_argument('--regions', type=str, nargs='+', default=[DEFAULT_REGION], + help="region to filter to. " \ + f"Regions should match values in the {REGION_COLUMN!r} column of the metadata file " \ + f"if specifying values other than the default {DEFAULT_REGION!r} region.") parser.add_argument("--pivot-interval", type=int, default=3, help="number of units between pivots") parser.add_argument("--pivot-interval-units", type=str, default="months", choices=['months', 'weeks'], @@ -97,6 +101,11 @@ def run(args): columns_to_load = [metadata_object.id_column, METADATA_DATE_COLUMN] if args.weights_attribute: columns_to_load.append(args.weights_attribute) + + filter_to_region = any(region != DEFAULT_REGION for region in args.regions) + if filter_to_region: + columns_to_load.append(REGION_COLUMN) + metadata = read_metadata( args.metadata, delimiters=[metadata_object.delimiter], @@ -130,6 +139,9 @@ def run(args): # Annotate tip with weight attribute. tip.attr[weights_attribute] = metadata.loc[tip.name, weights_attribute] + if filter_to_region: + tip.attr[REGION_COLUMN] = metadata.loc[tip.name, REGION_COLUMN] + if args.method == "diffusion": # estimate tree frequencies pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date, args.pivot_interval_units) @@ -139,10 +151,10 @@ def run(args): for region in args.regions: # Omit strains sampled prior to the first pivot from frequency calculations. # (these tend to be reference strains included for phylogenetic context) - if region=='global': + if region==DEFAULT_REGION: node_filter_func = lambda node: node.attr["num_date"] >= pivots[0] else: - node_filter_func = lambda node: (node.attr["region"] == region + node_filter_func = lambda node: (node.attr[REGION_COLUMN] == region and node.attr["num_date"] >= pivots[0]) tree_freqs = tree_frequencies(tree, pivots, method='SLSQP', diff --git a/tests/functional/frequencies/cram/diffusion-region.t b/tests/functional/frequencies/cram/diffusion-region.t new file mode 100644 index 000000000..6ee1285c5 --- /dev/null +++ b/tests/functional/frequencies/cram/diffusion-region.t @@ -0,0 +1,247 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Calculate diffusion-based tip frequencies from a refined tree with `--regions`. + + $ ${AUGUR} frequencies \ + > --method diffusion \ + > --tree "$TESTDIR/../data/tree.nwk" \ + > --metadata "$TESTDIR/../data/metadata.tsv" \ + > --regions "global" "North America" "South America" \ + > --pivot-interval 3 \ + > --output tip-frequencies.json > /dev/null + + $ cat tip-frequencies.json + { + "BRA/2016/FC_6706": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.2, + 0.2, + 0.2, + 0.2 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "COL/FLR_00008/2015": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.2, + 0.2, + 0.2, + 0.2 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "Colombia/2016/ZC204Se": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.2, + 0.2, + 0.2, + 0.2 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "DOM/2016/BB_0183": { + "North America": [ + 0.25, + 0.25, + 0.25, + 0.25 + ], + "South America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "EcEs062_16": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.2, + 0.2, + 0.2, + 0.2 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "HND/2016/HU_ME59": { + "North America": [ + 0.25, + 0.25, + 0.25, + 0.25 + ], + "South America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "PAN/CDC_259359_V1_V3/2015": { + "North America": [ + 0.25, + 0.25, + 0.25, + 0.25 + ], + "South America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "PRVABC59": { + "North America": [ + 0.25, + 0.25, + 0.25, + 0.25 + ], + "South America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "VEN/UF_1/2016": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.2, + 0.2, + 0.2, + 0.2 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "ZKC2/2016": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "counts": { + "North America": [ + 0, + 2, + 2, + 0 + ], + "South America": [ + 0, + 2, + 3, + 0 + ], + "global": [ + 0, + 5, + 5, + 0 + ] + }, + "generated_by": { + "program": "augur", + "version": ".*" (re) + }, + "pivots": [ + 2015.7521, + 2016.0041, + 2016.2527, + 2016.5014 + ] + } (no-eol)