From 878dc1f0ab861ff979332fcae7dda0bbc0e16ac7 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 23 Feb 2024 12:51:00 -0800 Subject: [PATCH 1/5] Add test showing current behavior --- .../frequencies/cram/diffusion-region.t | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/functional/frequencies/cram/diffusion-region.t diff --git a/tests/functional/frequencies/cram/diffusion-region.t b/tests/functional/frequencies/cram/diffusion-region.t new file mode 100644 index 000000000..4a7ff224e --- /dev/null +++ b/tests/functional/frequencies/cram/diffusion-region.t @@ -0,0 +1,15 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Calculate diffusion-based tip frequencies from a refined tree with `--regions`. +This currently does not work due to a bug. + + $ ${AUGUR} frequencies \ + > --method diffusion \ + > --tree "$TESTDIR/../data/tree.nwk" \ + > --metadata "$TESTDIR/../data/metadata.tsv" \ + > --regions "global" "North America" "South America" \ + > --pivot-interval 3 \ + > --output tip-frequencies.json > /dev/null 2>&1 + [2] From 696a9ca1cddc7675485ea66a5a881dfa2fa15ff7 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 23 Feb 2024 11:30:34 -0800 Subject: [PATCH 2/5] frequencies: use "region" column for `--regions` Fixes the use of the `--regions` flag with values other than the default "global" region by: 1. including the "region" column from the metadata file, similar to how the weight attribute columns are included in dce03745000d47634e30786177a34050a43682cd. 2. annotating the tips with the "region" values from the metadata file, similar to how the weight attributes are annotated in dde857a23dffdfa9d841ae8d245d77194bd741de. --- augur/frequencies.py | 10 +- .../frequencies/cram/diffusion-region.t | 238 +++++++++++++++++- 2 files changed, 244 insertions(+), 4 deletions(-) diff --git a/augur/frequencies.py b/augur/frequencies.py index 48f2b98fd..3ac879a44 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -14,6 +14,7 @@ from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN, InvalidDelimiter, Metadata, read_metadata from .utils import write_json +REGION_COLUMN = 'region' def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("frequencies", help=__doc__) @@ -97,6 +98,10 @@ def run(args): columns_to_load = [metadata_object.id_column, METADATA_DATE_COLUMN] if args.weights_attribute: columns_to_load.append(args.weights_attribute) + + if args.regions: + columns_to_load.append(REGION_COLUMN) + metadata = read_metadata( args.metadata, delimiters=[metadata_object.delimiter], @@ -130,6 +135,9 @@ def run(args): # Annotate tip with weight attribute. tip.attr[weights_attribute] = metadata.loc[tip.name, weights_attribute] + if args.regions: + tip.attr[REGION_COLUMN] = metadata.loc[tip.name, REGION_COLUMN] + if args.method == "diffusion": # estimate tree frequencies pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date, args.pivot_interval_units) @@ -142,7 +150,7 @@ def run(args): if region=='global': node_filter_func = lambda node: node.attr["num_date"] >= pivots[0] else: - node_filter_func = lambda node: (node.attr["region"] == region + node_filter_func = lambda node: (node.attr[REGION_COLUMN] == region and node.attr["num_date"] >= pivots[0]) tree_freqs = tree_frequencies(tree, pivots, method='SLSQP', diff --git a/tests/functional/frequencies/cram/diffusion-region.t b/tests/functional/frequencies/cram/diffusion-region.t index 4a7ff224e..6ee1285c5 100644 --- a/tests/functional/frequencies/cram/diffusion-region.t +++ b/tests/functional/frequencies/cram/diffusion-region.t @@ -3,7 +3,6 @@ Setup $ source "$TESTDIR"/_setup.sh Calculate diffusion-based tip frequencies from a refined tree with `--regions`. -This currently does not work due to a bug. $ ${AUGUR} frequencies \ > --method diffusion \ @@ -11,5 +10,238 @@ This currently does not work due to a bug. > --metadata "$TESTDIR/../data/metadata.tsv" \ > --regions "global" "North America" "South America" \ > --pivot-interval 3 \ - > --output tip-frequencies.json > /dev/null 2>&1 - [2] + > --output tip-frequencies.json > /dev/null + + $ cat tip-frequencies.json + { + "BRA/2016/FC_6706": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.2, + 0.2, + 0.2, + 0.2 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "COL/FLR_00008/2015": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.2, + 0.2, + 0.2, + 0.2 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "Colombia/2016/ZC204Se": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.2, + 0.2, + 0.2, + 0.2 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "DOM/2016/BB_0183": { + "North America": [ + 0.25, + 0.25, + 0.25, + 0.25 + ], + "South America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "EcEs062_16": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.2, + 0.2, + 0.2, + 0.2 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "HND/2016/HU_ME59": { + "North America": [ + 0.25, + 0.25, + 0.25, + 0.25 + ], + "South America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "PAN/CDC_259359_V1_V3/2015": { + "North America": [ + 0.25, + 0.25, + 0.25, + 0.25 + ], + "South America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "PRVABC59": { + "North America": [ + 0.25, + 0.25, + 0.25, + 0.25 + ], + "South America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "VEN/UF_1/2016": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.2, + 0.2, + 0.2, + 0.2 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "ZKC2/2016": { + "North America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "South America": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "global": [ + 0.1, + 0.1, + 0.1, + 0.1 + ] + }, + "counts": { + "North America": [ + 0, + 2, + 2, + 0 + ], + "South America": [ + 0, + 2, + 3, + 0 + ], + "global": [ + 0, + 5, + 5, + 0 + ] + }, + "generated_by": { + "program": "augur", + "version": ".*" (re) + }, + "pivots": [ + 2015.7521, + 2016.0041, + 2016.2527, + 2016.5014 + ] + } (no-eol) From d47999ba9f162c6f7137e9f4b7d13267f98df8a7 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 23 Feb 2024 13:02:10 -0800 Subject: [PATCH 3/5] frequencies: only use "region" column for non-global regions The "region" column is only required when specifying regions that are not the default "global" region. --- augur/frequencies.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/augur/frequencies.py b/augur/frequencies.py index 3ac879a44..bdbd031b2 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -15,6 +15,7 @@ from .utils import write_json REGION_COLUMN = 'region' +DEFAULT_REGION = 'global' def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("frequencies", help=__doc__) @@ -27,7 +28,7 @@ def register_parser(parent_subparsers): help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") - parser.add_argument('--regions', type=str, nargs='+', default=['global'], + parser.add_argument('--regions', type=str, nargs='+', default=[DEFAULT_REGION], help="region to subsample to") parser.add_argument("--pivot-interval", type=int, default=3, help="number of units between pivots") @@ -99,7 +100,8 @@ def run(args): if args.weights_attribute: columns_to_load.append(args.weights_attribute) - if args.regions: + filter_to_region = any(region != DEFAULT_REGION for region in args.regions) + if filter_to_region: columns_to_load.append(REGION_COLUMN) metadata = read_metadata( @@ -135,7 +137,7 @@ def run(args): # Annotate tip with weight attribute. tip.attr[weights_attribute] = metadata.loc[tip.name, weights_attribute] - if args.regions: + if filter_to_region: tip.attr[REGION_COLUMN] = metadata.loc[tip.name, REGION_COLUMN] if args.method == "diffusion": @@ -147,7 +149,7 @@ def run(args): for region in args.regions: # Omit strains sampled prior to the first pivot from frequency calculations. # (these tend to be reference strains included for phylogenetic context) - if region=='global': + if region==DEFAULT_REGION: node_filter_func = lambda node: node.attr["num_date"] >= pivots[0] else: node_filter_func = lambda node: (node.attr[REGION_COLUMN] == region From 56bf16a86b9413c067f7fae4d07b211a677b9a7b Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 23 Feb 2024 13:02:44 -0800 Subject: [PATCH 4/5] frequencies: update `--region` help text Update help text to reflect the use of the "region" column and values from the metadata file. --- augur/frequencies.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/augur/frequencies.py b/augur/frequencies.py index bdbd031b2..75a11ee3c 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -29,7 +29,9 @@ def register_parser(parent_subparsers): parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") parser.add_argument('--regions', type=str, nargs='+', default=[DEFAULT_REGION], - help="region to subsample to") + help="region to filter to. " \ + f"Regions should match values in the {REGION_COLUMN!r} column of the metadata file " \ + f"if specifying values other than the default {DEFAULT_REGION!r} region.") parser.add_argument("--pivot-interval", type=int, default=3, help="number of units between pivots") parser.add_argument("--pivot-interval-units", type=str, default="months", choices=['months', 'weeks'], From 455bcfab3a46c41ed69de0d9921967ce63d11d1a Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 23 Feb 2024 13:11:16 -0800 Subject: [PATCH 5/5] Update changelog --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 537d256ca..566d2ca26 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,8 +5,10 @@ ### Bug Fixes * filter: Updated the help and report text of `--min-length` to explicitly state that the minimum length filter only counts standard nucleotide characters A, C, G, or T (case-insensitive). This has been the behavior since version 3.0.3.dev1, but has never been explicitly documented. [#1422][] (@joverlee521) +* frequencies: Fixed a bug introduced in 24.2.0 and 24.1.0 that prevented `--regions` from working when providing regions other than the default "global" region. [#1424] [#1422]: https://github.com/nextstrain/augur/pull/1422 +[#1424]: https://github.com/nextstrain/augur/pull/1424 ## 24.2.2 (16 February 2024)