From 89156a78cbaa8719529353f4357127dec119e6c9 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 15 Mar 2022 12:10:28 -0700 Subject: [PATCH 01/21] Add "measurements" to list of valid panels Updates lists of valid panels in Auspice config JSON schema, Auspice JSON v2 schema, and the export v2 command's argparse choices. These changes allow users to add "measurements" to the list of panels in an Auspice config JSON or the command line. --- augur/data/schema-auspice-config-v2.json | 2 +- augur/data/schema-export-v2.json | 2 +- augur/export_v2.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/augur/data/schema-auspice-config-v2.json b/augur/data/schema-auspice-config-v2.json index 422b5f197..3a0ef5795 100644 --- a/augur/data/schema-auspice-config-v2.json +++ b/augur/data/schema-auspice-config-v2.json @@ -223,7 +223,7 @@ "minItems": 1, "items": { "type": "string", - "enum": ["tree", "map", "frequencies", "entropy"] + "enum": ["tree", "map", "frequencies", "entropy", "measurements"] } }, "vaccine_choices": { diff --git a/augur/data/schema-export-v2.json b/augur/data/schema-export-v2.json index 311ec8cc4..531281feb 100644 --- a/augur/data/schema-export-v2.json +++ b/augur/data/schema-export-v2.json @@ -103,7 +103,7 @@ "type": "array", "items": { "type": "string", - "enum": ["tree", "map", "frequencies", "entropy"] + "enum": ["tree", "map", "frequencies", "entropy", "measurements"] }, "uniqueItems": true, "minItems": 1 diff --git a/augur/export_v2.py b/augur/export_v2.py index b343a3a15..baa402215 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -842,7 +842,7 @@ def register_arguments_v2(subparsers): config.add_argument('--description', metavar="description.md", help="Markdown file with description of build and/or acknowledgements to be displayed by Auspice") config.add_argument('--geo-resolutions', metavar="trait", nargs='+', help="Geographic traits to be displayed on map") config.add_argument('--color-by-metadata', metavar="trait", 
nargs='+', help="Metadata columns to include as coloring options") - config.add_argument('--panels', metavar="panels", nargs='+', choices=['tree', 'map', 'entropy', 'frequencies'], help="Restrict panel display in auspice. Options are %(choices)s. Ignore this option to display all available panels.") + config.add_argument('--panels', metavar="panels", nargs='+', choices=['tree', 'map', 'entropy', 'frequencies', 'measurements'], help="Restrict panel display in auspice. Options are %(choices)s. Ignore this option to display all available panels.") optional_inputs = v2.add_argument_group( title="OPTIONAL INPUT FILES" From ee56099123623ee8f6c23d909a8ce1c8009cf844 Mon Sep 17 00:00:00 2001 From: Jover Date: Thu, 24 Mar 2022 13:55:19 -0700 Subject: [PATCH 02/21] Add JSON schema for measurements collection config A schema for the measurements collection config JSON to be supplied to `augur measurements export`. Basically a copy of the config properties for a collection within the measurements schema. In the future, we could look into using `jsonschema.RefResolver` to use refs that refer to a separate schema file. --- ...schema-measurements-collection-config.json | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 augur/data/schema-measurements-collection-config.json diff --git a/augur/data/schema-measurements-collection-config.json b/augur/data/schema-measurements-collection-config.json new file mode 100644 index 000000000..f0171cbb2 --- /dev/null +++ b/augur/data/schema-measurements-collection-config.json @@ -0,0 +1,107 @@ +{ + "$schema": "http://json-schema.org/draft-06/schema#", + "$id": "https://nextstrain.org/schemas/dataset/measurements/collection/config", + "title": "Collection config file to be supplied to `augur measurements export`", + "type": "object", + "additionalProperties": false, + "required": [], + "properties": { + "key": { + "description": "The short name of the collection that is only used internally within Auspice. 
Each collection is expected to have a unique key.", + "type": "string" + }, + "title": { + "description": "The title to display in the collections dropdown and panel title. Optional -- if not provided, then `key` will be used", + "type": "string" + }, + "fields": { + "description": "Custom field order and display titles. Order of the fields determines the order they are shown in the measurement hover display. Optional -- if not provided, then the key of the field will be displayed in alphabetical order.", + "type": "array", + "minItems": 1, + "items": { + "description": "A single field of the measurements", + "type": "object", + "additionalProperties": false, + "required": ["key"], + "properties": { + "key": { + "description": "The property name of the field within the measurement object", + "type": "string" + }, + "title": { + "description": "The display title for the field. Optional -- if not provided, then `key` will be used", + "type": "string" + } + } + } + }, + "groupings": { + "description": "The available group by fields for measurements. Order of the group by fields determines the order they are shown in the group by dropdown.", + "type": "array", + "minItems": 1, + "items": { + "description": "A single group by field for measurements", + "type": "object", + "additionalProperties": false, + "required": ["key"], + "properties": { + "key": { + "description": "The property name of the group by field within the measurement object", + "type": "string" + }, + "order": { + "description": "A custom order of group by values to customize the display order of the subplots when using group by field", + "type": "array", + "minItems": 1, + "items": { + "description": "A single value of the group by field present in measurements", + "type": ["string", "number", "boolean"] + } + } + } + } + }, + "filters": { + "description": "The available filter options for measurements. Order of the filter options determines the order they are shown in the filter dropdown. 
Optional -- if not provided, then all fields will be available as filters.", + "type": "array", + "minItems": 1, + "items": { + "description": "The property name of the filter field within the measurement object", + "type": "string" + } + }, + "x_axis_label": { + "description": "The short label to display for the x-axis that describes the `value` of the measurements in a collection", + "type": "string" + }, + "threshold": { + "description": "A numeric measurement threshold to be displayed as a single grey line shared across subplots. Optional -- if not provided, no threshold will be displayed", + "type": "number" + }, + "display_defaults": { + "description": "Default display options of the collection", + "type": "object", + "additionalProperties": false, + "minProperties": 1, + "properties": { + "group_by": { + "description": "Default group by field name that must be included as a group by option in the groupings array. Optional -- if not provided, first group by option will be used", + "type": "string" + }, + "measurements_display": { + "description": "Dictates how the measurements are displayed, either as the raw data points or as the means of values grouped by the tree color-by attribute", + "type": "string", + "enum": ["raw", "mean"] + }, + "show_overall_mean": { + "description": "Should the overall mean per group be displayed by default?", + "type": "boolean" + }, + "show_threshold": { + "description": "Should the threshold line be displayed by default? 
Ignored if no threshold has been provided for collection", + "type": "boolean" + } + } + } + } +} From 90688e39dd11e503ab4b117a7e9abdfe2bb777a3 Mon Sep 17 00:00:00 2001 From: Jover Date: Thu, 24 Mar 2022 14:21:22 -0700 Subject: [PATCH 03/21] augur validate: add measurements-collection-config subcommand Validates the provided measurements collection config JSON against the JSON schema `augur/data/schema-measurements-collection-config.json` --- augur/validate.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/augur/validate.py b/augur/validate.py index b0e516685..94186c7a2 100644 --- a/augur/validate.py +++ b/augur/validate.py @@ -126,6 +126,13 @@ def measurements(measurements_json, **kwargs): return measurements +def measurements_collection_config(collection_config_json, **kwargs): + schema = load_json_schema("schema-measurements-collection-config.json") + collection_config = load_json(collection_config_json) + validate_json(collection_config, schema, collection_config_json) + return collection_config + + def register_arguments(parser): subparsers = parser.add_subparsers(dest="subcommand", help="Which file(s) do you want to validate?") @@ -142,6 +149,9 @@ def register_arguments(parser): subparsers.add_parser("measurements", help="validate measurements JSON intended for auspice measurements panel") \ .add_argument("measurements_json", metavar="JSON", help="exported measurements JSON") + subparsers.add_parser("measurements-collection-config", help="validate measurement collection config intended for `augur measurements export`") \ + .add_argument("collection_config_json", metavar="JSON", help="collection config JSON") + def run(args): try: globals()[args.subcommand.replace('-','_')](**vars(args)) From 32db8dcd2cc0776a50a8eb5f2d456022942b0a23 Mon Sep 17 00:00:00 2001 From: Jover Date: Mon, 28 Mar 2022 17:41:55 -0700 Subject: [PATCH 04/21] validate measurements: add validations not checked by schema Adds validations for constraints on the values within 
measurements and collection config JSONs that cannot be checked via JSON schema validation. These include: 1. A collection's fields, groupings, and filters are valid fields in the collection's measurements. 2. A collection's display default group-by is included in the groupings 3. All collections within measurements JSON have unique keys 4. The default collection value is a valid key that matches one of the collections --- augur/validate.py | 155 +++++++++++++++++++++++++++++++++++++++++ tests/test_validate.py | 90 ++++++++++++++++++++++++ 2 files changed, 245 insertions(+) create mode 100644 tests/test_validate.py diff --git a/augur/validate.py b/augur/validate.py index 94186c7a2..dac61ec0a 100644 --- a/augur/validate.py +++ b/augur/validate.py @@ -119,10 +119,163 @@ def export_v1(meta_json, tree_json, **kwargs): print("Validation of {!r} and {!r} succeeded, but there were warnings you may want to resolve.".format(meta_json, tree_json)) +def get_unique_keys(list_of_dicts): + """ + Returns a set of unique keys from a list of dicts + + >>> list_of_dicts = [{"key1": "val1", "key2": "val2"}, {"key1": "val1", "key3": "val3"}] + >>> sorted(get_unique_keys(list_of_dicts)) + ['key1', 'key2', 'key3'] + """ + return set().union(*(single_dict.keys() for single_dict in list_of_dicts)) + + +def validate_collection_config_fields(collection, index=None): + """ + Validates a single collection's config field keys provided in fields, + groupings, and filters are valid fields that exist in measurements' fields. + + Prints any validation errors to stderr. + + Parameters + ---------- + collection: dict + A single collection to validate. Assumes that the collection has already passed the schema validation. + index: int, optional + the index of the collection within a list of collections in a measurements JSON. + Used to print more detailed error messages. 
+ + Returns + ------- + bool + True if collection's config is valid + """ + valid_collection_config_fields = True + nested_config_fields = ['fields', 'groupings'] + flat_config_fields = ['filters'] + # Create set of all measurements' fields for verifying field configs + all_measurement_fields = get_unique_keys(collection['measurements']) + + for config_field in (nested_config_fields + flat_config_fields): + invalid_fields = set() + for config_value in collection.get(config_field, []): + # config value can be a field name string (i.e. flat_config_fields) + # or a dict with the field name in 'key' (i.e. nested_config_fields) + field_name = config_value['key'] if config_field in nested_config_fields else config_value + if field_name not in all_measurement_fields: + invalid_fields.add(field_name) + + if invalid_fields: + valid_collection_config_fields = False + include_index = f"(at index {index}) " if index is not None else "" + print( + f"ERROR: Collection {include_index}includes {config_field} that " + + f"do not exist as fields in measurements: {invalid_fields}.", + file=sys.stderr + ) + + return valid_collection_config_fields + + +def validate_collection_display_defaults(collection, index=None): + """ + Validates a single collection's display defaults. If a default group-by + field is provided, the field must be included in groupings. + + Prints validation errors to stderr. + + Parameters + ---------- + collection: dict + A single collection to validate. Assumes that the collection has already passed the schema validation. + index: int, optional + The index of the collection within a list of collections in a measurements JSON. + Used to print more detailed error messages. 
+ + Returns + ------- + bool + True if collection's display defaults are valid + """ + valid_display_defaults = True + + grouping_fields = {grouping['key'] for grouping in collection['groupings']} + default_grouping = collection.get('display_defaults', {}).get('group_by') + + if default_grouping and default_grouping not in grouping_fields: + valid_display_defaults = False + include_index = f"(at index {index}) " if index is not None else "" + print( + f"ERROR: Collection {include_index}has a default group-by field " + + f"'{default_grouping}' that is not included in the groupings' fields.", + file=sys.stderr + ) + + return valid_display_defaults + + +def validate_measurements_config(measurements): + """ + Validate measurements' config values meet expectations described in the + measurements JSON schema descriptions that cannot be verified via + `validate_json`: + 1. Individual collections have valid config values + 2. All collections have unique keys + 3. If a default collection is provided, it matches one of the collections + + Prints any validation errors to stderr. + + Parameters + ---------- + measurements: dict + Loaded measurements JSON to validate. Assumes the measurements JSON has already passed the schema validation. 
+ + Returns + ------- + bool + True if measurements' config is valid + """ + valid_measurements_config = True + collection_keys = defaultdict(list) + + # First check configs for individual collections + for index, collection in enumerate(measurements['collections']): + # Save the collection key and index of collection to verify unique keys later + collection_keys[collection['key']].append(index) + + if not all([ + validate_collection_config_fields(collection, index), + validate_collection_display_defaults(collection, index) + ]): + valid_measurements_config = False + + # Check collections have unique keys + for collection_key, collection_indexes in collection_keys.items(): + if len(collection_indexes) > 1: + valid_measurements_config = False + print( + f"ERROR: Collections at indexes {collection_indexes} share the same collection key '{collection_key}'.", + file=sys.stderr + ) + + # Check the default collection value matches a collection's key value + default_collection = measurements.get('default_collection') + if default_collection and default_collection not in collection_keys.keys(): + valid_measurements_config = False + print( + f"ERROR: The default collection key «{default_collection}» does not match any of the collections' keys.", + file=sys.stderr + ) + + return valid_measurements_config + + def measurements(measurements_json, **kwargs): schema = load_json_schema("schema-measurements.json") measurements = load_json(measurements_json) validate_json(measurements, schema, measurements_json) + if not validate_measurements_config(measurements): + raise ValidateError("Validation of the measurements' config values failed.") return measurements @@ -130,6 +283,8 @@ def measurements_collection_config(collection_config_json, **kwargs): schema = load_json_schema("schema-measurements-collection-config.json") collection_config = load_json(collection_config_json) validate_json(collection_config, schema, collection_config_json) + if not 
validate_collection_display_defaults(collection_config): + raise ValidateError("Validation of the collection config display defaults failed.") return collection_config diff --git a/tests/test_validate.py b/tests/test_validate.py new file mode 100644 index 000000000..1caeee20d --- /dev/null +++ b/tests/test_validate.py @@ -0,0 +1,90 @@ +import pytest +import random + +from augur.validate import ( + validate_collection_config_fields, + validate_collection_display_defaults, + validate_measurements_config +) + + +@pytest.fixture +def example_collection_measurements(): + return [ + {"strain": "strain_1", "value": 0, "valid_field_1": "value_1a", "valid_field_2": "value_2a", "valid_field_3": "value_3a"}, + {"strain": "strain_2", "value": 0, "valid_field_1": "value_1b", "valid_field_2": "value_2b", "valid_field_3": "value_3b"}, + {"strain": "strain_3", "value": 0, "valid_field_1": "value_1c", "valid_field_2": "value_2c", "valid_field_3": "value_3c"} + ] + +@pytest.fixture +def example_collection(example_collection_measurements): + return { + "key": "collection_0", + "fields": [ + {"key": "valid_field_1"}, + {"key": "valid_field_2"}, + {"key": "valid_field_3"} + ], + "groupings": [ + {"key": "valid_field_1"}, + {"key": "valid_field_2"}, + {"key": "valid_field_3"} + ], + "filters": ["valid_field_1", "valid_field_2", "valid_field_3"], + "display_defaults": { + "group_by": "valid_field_1" + }, + "measurements": example_collection_measurements + } + +@pytest.fixture +def example_measurements(example_collection): + number_of_collections = 10 + return { + "default_collection": f"collection_{random.randint(0, number_of_collections - 1)}", + "collections": [{**example_collection, "key": f"collection_{x}"} for x in range(number_of_collections)] + } + +class TestValidateMeasurements(): + def test_validate_collection_config_fields_valid(self, example_collection): + assert validate_collection_config_fields(example_collection) + + @pytest.mark.parametrize( + "invalid_config", + [ + 
{"fields": [{"key": "invalid_field"}]}, + {"groupings": [{"key": "invalid_field"}]}, + {"filters": ["invalid_field"]} + ] + ) + def test_validate_collection_config_fields_invalid(self, invalid_config, example_collection_measurements, capsys): + collection = {**invalid_config, "measurements": example_collection_measurements} + assert not validate_collection_config_fields(collection) + assert capsys.readouterr().err == f"ERROR: Collection includes {next(iter(invalid_config))} that do not exist as fields in measurements: {{'invalid_field'}}.\n" + + def test_validate_collection_display_defaults_valid(self, example_collection): + assert validate_collection_display_defaults(example_collection) + + def test_validate_collection_display_defaults_invalid(self, example_collection, capsys): + collection = {**example_collection} + collection["display_defaults"]["group_by"] = "invalid_field" + assert not validate_collection_display_defaults(collection) + assert capsys.readouterr().err == "ERROR: Collection has a default group-by field 'invalid_field' that is not included in the groupings' fields.\n" + + def test_validate_measurements_config_valid(self, example_measurements): + assert validate_measurements_config(example_measurements) + + def test_validate_measurements_config_duplicate_collection_keys(self, example_collection, capsys): + measurements = { + "collections": [example_collection] * 2 + } + assert not validate_measurements_config(measurements) + assert capsys.readouterr().err == "ERROR: Collections at indexes [0, 1] share the same collection key 'collection_0'.\n" + + def test_validate_measurements_config_invalid_default_collection(self, example_measurements, capsys): + measurements = { + **example_measurements, + "default_collection": "invalid_collection" + } + assert not validate_measurements_config(measurements) + assert capsys.readouterr().err == "ERROR: The default collection key «invalid_collection» does not match any of the collections' keys.\n" From 
a1c2c031775affb786d64e8ad9ee05bf179c54bd Mon Sep 17 00:00:00 2001 From: Jover Date: Wed, 30 Mar 2022 13:07:51 -0700 Subject: [PATCH 05/21] Add augur measurements export subcommand The `augur measurements export` subcommand creates the measurements JSON for a single collection of measurements provided in a TSV. The most basic measurements export command takes the command-line options for the required fields to create the minimal measurements JSON. --- augur/__init__.py | 3 +- augur/measurements.py | 194 ++++++++++++++++++ tests/functional/measurements_export.t | 54 +++++ .../measurements_export/collection.tsv | 4 + ...ollection_without_strain_value_columns.tsv | 4 + .../minimal_measurements.json | 36 ++++ 6 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 augur/measurements.py create mode 100644 tests/functional/measurements_export.t create mode 100644 tests/functional/measurements_export/collection.tsv create mode 100644 tests/functional/measurements_export/collection_without_strain_value_columns.tsv create mode 100644 tests/functional/measurements_export/minimal_measurements.json diff --git a/augur/__init__.py b/augur/__init__.py index cabb21e67..31ee68261 100644 --- a/augur/__init__.py +++ b/augur/__init__.py @@ -40,7 +40,8 @@ "export", "validate", "version", - "import" + "import", + "measurements", ] COMMANDS = [importlib.import_module('augur.' + c) for c in command_strings] diff --git a/augur/measurements.py b/augur/measurements.py new file mode 100644 index 000000000..dcb1b0e16 --- /dev/null +++ b/augur/measurements.py @@ -0,0 +1,194 @@ +""" +Create JSON files suitable for visualization within the measurements panel of Auspice. +""" +import os +import pandas as pd +import sys + +from .utils import write_json +from .validate import measurements as validate_measurements_json, ValidateError + + +def column_exists(collection, column, column_purpose): + """ + Checks the provided column exists in the provided collection. 
+ Prints an error message to stderr if the column does not exist. + + Parameters + ---------- + collection: pandas.DataFrame + Collection of measurements and metadata + column: str + Column to check exists in the collection + column_purpose: str + Purpose of provided column for detailed error message + + Returns + ------- + bool + True if column exists in collection + """ + column_in_df = column in collection.columns + if not column_in_df: + print( + f"ERROR: Provided {column_purpose} column «{column}» does not exist in collection TSV.", + file=sys.stderr, + ) + return column_in_df + + +def load_collection(collection, strain_column, value_column): + """ + Loads the provided collection TSV as a pandas DataFrame. + Renames the provided strain and value columns if needed and ensures the + value column has a numeric dtype. + + Prints any error messages to stderr. + + Parameters + ---------- + collection: str + Filepath to the collection TSV file + strain_column: str + The name of the strain column within the collection TSV + value_column: str + The name of the value column within the collection TSV + + Returns + ------- + pandas.DataFrame or None + The collection DataFrame or None if any errors were encountered during loading + """ + try: + collection_df = pd.read_csv(collection, sep="\t") + except FileNotFoundError: + print( + f"ERROR: collection TSV file ({collection}) does not exist", + file=sys.stderr, + ) + return None + + # Verify the strain and value columns are different + if strain_column == value_column: + print( + "ERROR: The strain column and value column cannot be the same column.", + file=sys.stderr + ) + return None + + # Define mapping of required columns to user provided columns + required_column_map = { + strain_column: 'strain', + value_column: 'value', + } + + # Check all required columns are included in collection TSV + checks_passed = True + for provided_column, required_column in required_column_map.items(): + # Confirm the provided column exists 
+ if not column_exists(collection_df, provided_column, required_column): + checks_passed = False + # Confirm the provided column does not overwrite an existing column + if (required_column in collection_df.columns and + provided_column != required_column): + print( + f"ERROR: Cannot use provided '{provided_column}' column as the {required_column} column " + + f"because a '{required_column}' column already exists in collection TSV.", + file=sys.stderr, + ) + checks_passed = False + + if not checks_passed: + return None + + # Rename user provided columns to expected columns + collection_df = collection_df.rename(columns=required_column_map) + + # Make sure the value column is numeric + try: + collection_df['value'] = pd.to_numeric(collection_df['value']) + except ValueError as e: + print(f"ERROR: Found a non-numeric measurement value: {e}", file=sys.stderr) + return None + + return collection_df + + +def validate_output_json(output_json): + """ + Validate the output JSON against the measurements schema + + Parameters + ---------- + output_json: str + Filepath to output JSON + + """ + print("Validating produced measurements JSON") + try: + validate_measurements_json(measurements_json=output_json) + except ValidateError: + print( + "ERROR: Validation of output JSON failed. See detailed errors above.", + file=sys.stderr, + ) + sys.exit(1) + + +def export_measurements(args): + # Load input collection TSV file + collection_df = load_collection(args['collection'], args['strain_column'], args['value_column']) + + if collection_df is None: + print("ERROR: Loading of collection TSV was unsuccessful. 
See detailed errors above.", file=sys.stderr) + sys.exit(1) + + # Create collection output object with required keys + collection_output = { + 'key': os.path.basename(args['collection']), + 'groupings': [{'key': col} for col in args['grouping_column'] if column_exists(collection_df, col, "grouping")], + 'x_axis_label': 'measurement values', + 'measurements': collection_df.to_dict(orient='records') + } + + # Create final output with single collection + output = { + 'collections': [collection_output] + } + + # Set indentation to None to create compact JSON if specified + indent = {"indent": None} if args['minify_json'] else {} + # Create output JSON + write_json(output, args['output_json'], include_version=False, **indent) + # Verify the produced output is a valid measurements JSON + validate_output_json(args['output_json']) + + +def register_arguments(parser): + subparsers = parser.add_subparsers(dest='subcommand') + subparsers.required = True + + export = subparsers.add_parser("export", help="Export a measurements JSON for a single collection") + export.add_argument("--collection", required=True, metavar="TSV", + help="Collection of measurements and metadata in a TSV file. " + + "Keep in mind duplicate columns will be renamed as 'X', 'X.1', 'X.2'...'X.N'") + export.add_argument("--strain-column", default="strain", + help="Name of the column containing strain names. " + + "Provided column will be renamed to `strain` so please make sure no other columns are named `strain`. " + + "Strain names in this column should match the strain names in the corresponding Auspice dataset JSON.") + export.add_argument("--value-column", default="value", + help="Name of the column containing the numeric values to be plotted for the given collection. " + + "Provided column will be renamed to `value` so please make sure no other columns are named `value`. 
") + export.add_argument("--grouping-column", required=True, nargs="+", + help="Name of the column(s) that should be used as grouping(s) for measurements.") + export.add_argument("--minify-json", action="store_true", + help="Export JSON without indentation or line returns.") + export.add_argument("--output-json", required=True, metavar="JSON", type=str, + help="Output JSON file. " + + "The file name must follow the Auspice sidecar file naming convention to be recognized as a sidecar file. " + + "See Nextstrain data format docs for more details.") + + +def run(args): + if args.subcommand == 'export': + return export_measurements(vars(args)) diff --git a/tests/functional/measurements_export.t b/tests/functional/measurements_export.t new file mode 100644 index 000000000..e604fd756 --- /dev/null +++ b/tests/functional/measurements_export.t @@ -0,0 +1,54 @@ +Integration tests for augur measurements export. + + $ pushd "$TESTDIR" > /dev/null + $ export AUGUR="../../bin/augur" + +Minimal measurements export with existing strain and value columns. + + $ ${AUGUR} measurements export \ + > --collection measurements_export/collection.tsv \ + > --grouping-column field_1 \ + > --output-json "$TMP/minimal_measurements.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" measurements_export/minimal_measurements.json "$TMP/minimal_measurements.json" + {} + +Minimal measurements export with user provided strain and value columns. 
+ + $ ${AUGUR} measurements export \ + > --collection measurements_export/collection_without_strain_value_columns.tsv \ + > --strain-column strain_field \ + > --value-column value_field \ + > --grouping-column field_1 \ + > --output-json "$TMP/minimal_measurements.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" measurements_export/minimal_measurements.json "$TMP/minimal_measurements.json" \ + > --exclude-paths "root['collections'][0]['key']" + {} + +Try measurements export with user provided strain and value columns that would overwrite existing columns. +This is expected to fail. + + $ ${AUGUR} measurements export \ + > --collection measurements_export/collection.tsv \ + > --strain-column field_1 \ + > --value-column field_2 \ + > --grouping-column field_1 \ + > --output-json "$TMP/minimal_measurements.json" + ERROR: Cannot use provided 'field_1' column as the strain column because a 'strain' column already exists in collection TSV. + ERROR: Cannot use provided 'field_2' column as the value column because a 'value' column already exists in collection TSV. + ERROR: Loading of collection TSV was unsuccessful. See detailed errors above. + [1] + +Try measurements export with user provided strain and value columns that are the same column. +This is expected to fail. + + $ ${AUGUR} measurements export \ + > --collection measurements_export/collection_without_strain_value_columns.tsv \ + > --strain-column field_1 \ + > --value-column field_1 \ + > --grouping-column field_1 \ + > --output-json "$TMP/minimal_measurements.json" + ERROR: The strain column and value column cannot be the same column. + ERROR: Loading of collection TSV was unsuccessful. See detailed errors above. 
+ [1] diff --git a/tests/functional/measurements_export/collection.tsv b/tests/functional/measurements_export/collection.tsv new file mode 100644 index 000000000..55d45e894 --- /dev/null +++ b/tests/functional/measurements_export/collection.tsv @@ -0,0 +1,4 @@ +strain value field_1 field_2 field_3 +strain_1 1.0 value_1 value_1 value_1 +strain_2 2.0 value_2 value_2 value_2 +strain_3 3.0 value_3 value_3 value_3 diff --git a/tests/functional/measurements_export/collection_without_strain_value_columns.tsv b/tests/functional/measurements_export/collection_without_strain_value_columns.tsv new file mode 100644 index 000000000..f3bc05295 --- /dev/null +++ b/tests/functional/measurements_export/collection_without_strain_value_columns.tsv @@ -0,0 +1,4 @@ +strain_field value_field field_1 field_2 field_3 +strain_1 1.0 value_1 value_1 value_1 +strain_2 2.0 value_2 value_2 value_2 +strain_3 3.0 value_3 value_3 value_3 diff --git a/tests/functional/measurements_export/minimal_measurements.json b/tests/functional/measurements_export/minimal_measurements.json new file mode 100644 index 000000000..a2e2e8423 --- /dev/null +++ b/tests/functional/measurements_export/minimal_measurements.json @@ -0,0 +1,36 @@ +{ + "collections": [ + { + "groupings": [ + { + "key": "field_1" + } + ], + "key": "collection.tsv", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "x_axis_label": "measurement values" + } + ] +} \ No newline at end of file From b62477b3108f65c7275b96587e5871278b960442 Mon Sep 17 00:00:00 2001 From: Jover Date: Wed, 30 Mar 2022 18:40:29 -0700 Subject: [PATCH 06/21] augur measurements export: add advanced options Add ability to provide config 
options via a collection config JSON or via command line args. The config JSON includes all available configs but there are only command line args for configs that have 1:1 options. Nested options such as fields' titles and groupings' orders have been excluded to reduce complexity of command line args. Similar to the auspice export command, the command line args will override the values of the config JSON. --- augur/measurements.py | 173 ++++++++++++++++-- tests/functional/measurements_export.t | 49 +++++ .../collection_config.json | 48 +++++ ...gle_collection_with_args_measurements.json | 51 ++++++ ...e_collection_with_config_measurements.json | 75 ++++++++ ...ollection_with_overrides_measurements.json | 61 ++++++ 6 files changed, 441 insertions(+), 16 deletions(-) create mode 100644 tests/functional/measurements_export/collection_config.json create mode 100644 tests/functional/measurements_export/single_collection_with_args_measurements.json create mode 100644 tests/functional/measurements_export/single_collection_with_config_measurements.json create mode 100644 tests/functional/measurements_export/single_collection_with_overrides_measurements.json diff --git a/augur/measurements.py b/augur/measurements.py index dcb1b0e16..2c47a8353 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -1,12 +1,25 @@ """ Create JSON files suitable for visualization within the measurements panel of Auspice. """ +import argparse import os import pandas as pd import sys from .utils import write_json -from .validate import measurements as validate_measurements_json, ValidateError +from .validate import ( + measurements as validate_measurements_json, + measurements_collection_config as validate_collection_config_json, + ValidateError +) + +class HideAsFalseAction(argparse.Action): + """ + Custom argparse Action that stores False for arguments passed as `--hide*` + and stores True for all other argument patterns. 
+ """ + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, option_string[2:6] != 'hide') def column_exists(collection, column, column_purpose): @@ -114,6 +127,56 @@ def load_collection(collection, strain_column, value_column): return collection_df +def get_collection_groupings(collection, grouping_columns): + """ + Creates the groupings for the provided collection using the provided + grouping columns after verifying the columns exist in the collection. + + Parameters + ---------- + collection: pandas.DataFrame + The collection used to validate groupings + grouping_columns: list[str] + List of grouping column names + + Returns + ------- + list[dict] or None + The groupings for the collection config or None if all grouping columns are invalid + """ + groupings = [{'key': col} for col in grouping_columns if column_exists(collection, col, "grouping")] + + if not groupings: + print("ERROR: Provided grouping columns were invalid for provided collection.", file=sys.stderr) + return None + + return groupings + + +def override_config_with_args(config, args): + """ + Overrides values in the config with values of provided command line args. 
+ + Parameter + --------- + config: dict + A collection config + args: dict + The __dict__ attribute of the parsed arguments from argparse + """ + config_key_args = ['key', 'title', 'filters', 'x_axis_label', 'threshold'] + display_default_args = ['group_by', 'measurements_display', 'show_overall_mean', 'show_threshold'] + + for key_arg in config_key_args: + if args.get(key_arg) is not None: + config[key_arg] = args[key_arg] + + for default_arg in display_default_args: + if args.get(default_arg) is not None: + config['display_defaults'] = config.get('display_defaults', {}) + config['display_defaults'][default_arg] = args[default_arg] + + def validate_output_json(output_json): """ Validate the output JSON against the measurements schema @@ -143,12 +206,42 @@ def export_measurements(args): print("ERROR: Loading of collection TSV was unsuccessful. See detailed errors above.", file=sys.stderr) sys.exit(1) - # Create collection output object with required keys + collection_config = {} + if args.get('collection_config'): + try: + collection_config = validate_collection_config_json(args['collection_config']) + except ValidateError: + print( + f"Validation of provided collection config JSON {args['collection_config']} failed. 
" + + "Please check the formatting of this file.", + file=sys.stderr + ) + sys.exit(1) + + groupings = collection_config.pop('groupings', None) + if args.get('grouping_column'): + groupings = get_collection_groupings(collection_df, args['grouping_column']) + if collection_config.get('display_defaults', {}).pop('group_by', None): + print( + "WARNING: The default group-by in the collection config has been removed " + + "because new groupings have been provided via the --grouping-column option.", + file=sys.stderr + ) + + if not groupings: + print("ERROR: Cannot create measurements JSON without valid groupings", file=sys.stderr) + sys.exit(1) + + # Combine collection config with command line args + override_config_with_args(collection_config, args) + + # Create collection output object with default values for required keys collection_output = { - 'key': os.path.basename(args['collection']), - 'groupings': [{'key': col} for col in args['grouping_column'] if column_exists(collection_df, col, "grouping")], - 'x_axis_label': 'measurement values', - 'measurements': collection_df.to_dict(orient='records') + 'key': collection_config.pop('key', os.path.basename(args['collection'])), + 'groupings': groupings, + 'x_axis_label': collection_config.pop('x_axis_label', 'measurement values'), + 'measurements': collection_df.to_dict(orient='records'), + **collection_config } # Create final output with single collection @@ -169,25 +262,73 @@ def register_arguments(parser): subparsers.required = True export = subparsers.add_parser("export", help="Export a measurements JSON for a single collection") - export.add_argument("--collection", required=True, metavar="TSV", + + required = export.add_argument_group( + title="REQUIRED" + ) + required.add_argument("--collection", required=True, metavar="TSV", help="Collection of measurements and metadata in a TSV file. 
" + "Keep in mind duplicate columns will be renamed as 'X', 'X.1', 'X.2'...'X.N'") - export.add_argument("--strain-column", default="strain", + required.add_argument("--strain-column", default="strain", help="Name of the column containing strain names. " + "Provided column will be renamed to `strain` so please make sure no other columns are named `strain`. " + - "Strain names in this column should match the strain names in the corresponding Auspice dataset JSON.") - export.add_argument("--value-column", default="value", + "Strain names in this column should match the strain names in the corresponding Auspice dataset JSON. " + + "(default: %(default)s)") + required.add_argument("--value-column", default="value", help="Name of the column containing the numeric values to be plotted for the given collection. " + - "Provided column will be renamed to `value` so please make sure no other columns are named `value`. ") - export.add_argument("--grouping-column", required=True, nargs="+", - help="Name of the column(s) that should be used as grouping(s) for measurements.") - export.add_argument("--minify-json", action="store_true", - help="Export JSON without indentation or line returns.") - export.add_argument("--output-json", required=True, metavar="JSON", type=str, + "Provided column will be renamed to `value` so please make sure no other columns are named `value`. " + + "(default: %(default)s)") + required.add_argument("--output-json", required=True, metavar="JSON", type=str, help="Output JSON file. " + "The file name must follow the Auspice sidecar file naming convention to be recognized as a sidecar file. " + "See Nextstrain data format docs for more details.") + config = export.add_argument_group( + title="COLLECTION CONFIGURATION", + description="These options control the configuration of the collection for auspice." + + "You can provide a config JSON (which includes all available options) or " + + "command line arguments (which are more limited). 
" + + "Command line arguments will override the values set in the config JSON." + ) + config.add_argument("--collection-config", metavar="JSON", + help="Collection configuration file for advanced configurations. ") + config.add_argument("--grouping-column", nargs="+", + help="Name of the column(s) that should be used as grouping(s) for measurements. " + + "Note that if groupings are provided via command line args, the default group-by " + + "field in the config JSON will be dropped.") + config.add_argument("--key", + help="A short key name of the collection for internal use within auspice." + + "If not provided via config or command line option, the collection TSV filename will be used. ") + config.add_argument("--title", + help="The full title of the collection to display in the measurements panel title. " + + "If not provided via config or command line option, the panel's default title is 'Measurements'.") + config.add_argument("--x-axis-label", + help="The short label to display for the x-axis that describles the value of the measurements. " + + "If not provided via config or command line option, the panel's default x-axis label is 'measurements values'.") + config.add_argument("--threshold", type=float, + help="A measurements value threshold to be displayed as a single grey line shared across subplots.") + config.add_argument("--filters", nargs="+", + help="The columns that are to be used a filters for measurements. " + + "If not provided, all columns will be available as filters.") + config.add_argument("--group-by", type=str, + help="The default grouping column. 
If not provided, the first grouping will be used.") + config.add_argument("--measurements-display", type=str, choices=["raw", "mean"], + help="The default display of the measurements") + + config.add_argument("--show-overall-mean", "--hide-overall-mean", + dest="show_overall_mean", action=HideAsFalseAction, nargs=0, + help="Show or hide the overall mean per group by default") + config.add_argument("--show-threshold", "--hide-threshold", + dest="show_threshold", action=HideAsFalseAction, nargs=0, + help="Show or hide the threshold by default. This will be ignored if no threshold is provided.") + + optional_settings = export.add_argument_group( + title="OPTIONAL SETTINGS" + ) + optional_settings.add_argument("--minify-json", action="store_true", + help="Export JSON without indentation or line returns.") + + def run(args): if args.subcommand == 'export': diff --git a/tests/functional/measurements_export.t b/tests/functional/measurements_export.t index e604fd756..8be8b0fa9 100644 --- a/tests/functional/measurements_export.t +++ b/tests/functional/measurements_export.t @@ -52,3 +52,52 @@ This is expected to fail. ERROR: The strain column and value column cannot be the same column. ERROR: Loading of collection TSV was unsuccessful. See detailed errors above. [1] + +Measurements export for a single collection using only command line configs. 
+ + $ ${AUGUR} measurements export \ + > --collection measurements_export/collection.tsv \ + > --grouping-column field_1 field_2 \ + > --key args-collection \ + > --title collection-display-title \ + > --x-axis-label label \ + > --threshold 2.0 \ + > --filters field_1 field_2 \ + > --group-by field_1 \ + > --measurements-display mean \ + > --show-overall-mean \ + > --show-threshold \ + > --output-json "$TMP/single_collection_with_args_measurements.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" measurements_export/single_collection_with_args_measurements.json "$TMP/single_collection_with_args_measurements.json" + {} + +Measurements export for a single collection using a collection config. + + $ ${AUGUR} measurements export \ + > --collection measurements_export/collection.tsv \ + > --collection-config measurements_export/collection_config.json \ + > --output-json "$TMP/single_collection_with_config_measurements.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" measurements_export/single_collection_with_config_measurements.json "$TMP/single_collection_with_config_measurements.json" + {} + +Measurements export for a single collection using a collection config and command-line overrides. 
+ + $ ${AUGUR} measurements export \ + > --collection measurements_export/collection.tsv \ + > --collection-config measurements_export/collection_config.json \ + > --grouping-column field_3 \ + > --key override-collection \ + > --title override-collection-display-title \ + > --x-axis-label override-label \ + > --threshold 10.0 \ + > --filters field_3 \ + > --group-by field_3 \ + > --measurements-display raw \ + > --hide-overall-mean \ + > --hide-threshold \ + > --output-json "$TMP/single_collection_with_overrides_measurements.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" measurements_export/single_collection_with_overrides_measurements.json "$TMP/single_collection_with_overrides_measurements.json" + {} diff --git a/tests/functional/measurements_export/collection_config.json b/tests/functional/measurements_export/collection_config.json new file mode 100644 index 000000000..f04320301 --- /dev/null +++ b/tests/functional/measurements_export/collection_config.json @@ -0,0 +1,48 @@ +{ + "key": "config-collection", + "title": "collection-display-title", + "fields": [ + { + "key": "field_1", + "title": "field_title_1" + }, + { + "key": "field_2", + "title": "field_title_2" + }, + { + "key": "field_3", + "title": "field_title_3" + } + ], + "groupings": [ + { + "key": "field_1", + "order": [ + "value_3", + "value_1", + "value_2" + ] + }, + { + "key": "field_2", + "order": [ + "value_2", + "value_3", + "value_1" + ] + } + ], + "filters": [ + "field_1", + "field_2" + ], + "x_axis_label": "label", + "threshold": 2.0, + "display_defaults": { + "group_by": "field_1", + "measurements_display": "mean", + "show_overall_mean": true, + "show_threshold": true + } +} diff --git a/tests/functional/measurements_export/single_collection_with_args_measurements.json b/tests/functional/measurements_export/single_collection_with_args_measurements.json new file mode 100644 index 000000000..9dc47d750 --- /dev/null +++ 
b/tests/functional/measurements_export/single_collection_with_args_measurements.json @@ -0,0 +1,51 @@ +{ + "collections": [ + { + "display_defaults": { + "group_by": "field_1", + "measurements_display": "mean", + "show_overall_mean": true, + "show_threshold": true + }, + "filters": [ + "field_1", + "field_2" + ], + "groupings": [ + { + "key": "field_1" + }, + { + "key": "field_2" + } + ], + "key": "args-collection", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "threshold": 2.0, + "title": "collection-display-title", + "x_axis_label": "label" + } + ] +} diff --git a/tests/functional/measurements_export/single_collection_with_config_measurements.json b/tests/functional/measurements_export/single_collection_with_config_measurements.json new file mode 100644 index 000000000..484dfcb2a --- /dev/null +++ b/tests/functional/measurements_export/single_collection_with_config_measurements.json @@ -0,0 +1,75 @@ +{ + "collections": [ + { + "display_defaults": { + "group_by": "field_1", + "measurements_display": "mean", + "show_overall_mean": true, + "show_threshold": true + }, + "fields": [ + { + "key": "field_1", + "title": "field_title_1" + }, + { + "key": "field_2", + "title": "field_title_2" + }, + { + "key": "field_3", + "title": "field_title_3" + } + ], + "filters": [ + "field_1", + "field_2" + ], + "groupings": [ + { + "key": "field_1", + "order": [ + "value_3", + "value_1", + "value_2" + ] + }, + { + "key": "field_2", + "order": [ + "value_2", + "value_3", + "value_1" + ] + } + ], + "key": "config-collection", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + 
"value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "threshold": 2.0, + "title": "collection-display-title", + "x_axis_label": "label" + } + ] +} diff --git a/tests/functional/measurements_export/single_collection_with_overrides_measurements.json b/tests/functional/measurements_export/single_collection_with_overrides_measurements.json new file mode 100644 index 000000000..96d51358c --- /dev/null +++ b/tests/functional/measurements_export/single_collection_with_overrides_measurements.json @@ -0,0 +1,61 @@ +{ + "collections": [ + { + "display_defaults": { + "group_by": "field_3", + "measurements_display": "raw", + "show_overall_mean": false, + "show_threshold": false + }, + "fields": [ + { + "key": "field_1", + "title": "field_title_1" + }, + { + "key": "field_2", + "title": "field_title_2" + }, + { + "key": "field_3", + "title": "field_title_3" + } + ], + "filters": [ + "field_3" + ], + "groupings": [ + { + "key": "field_3" + } + ], + "key": "override-collection", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "threshold": 10.0, + "title": "override-collection-display-title", + "x_axis_label": "override-label" + } + ] +} From 3eccec4f5c0c73b10188f16d5b6cb63e29f5be38 Mon Sep 17 00:00:00 2001 From: Jover Date: Wed, 30 Mar 2022 19:38:41 -0700 Subject: [PATCH 07/21] Add augur measurements concat subcommand The `augur measurements concat` subcommand concatentates multiple measurements JSONs into a single measurements JSON. 
Depends on the measurements validation to verify each measurements JSON is valid and the final produced measurements JSON is valid. --- augur/measurements.py | 29 +++++ tests/functional/measurements_concat.t | 42 ++++++ .../multiple_collections_measurements.json | 101 ++++++++++++++ .../single_collection_measurements_1.json | 36 +++++++ .../single_collection_measurements_2.json | 36 +++++++ .../single_collection_measurements_3.json | 36 +++++++ .../two_collections_measurements.json | 69 ++++++++++++ 7 files changed, 349 insertions(+) create mode 100644 tests/functional/measurements_concat.t create mode 100644 tests/functional/measurements_concat/multiple_collections_measurements.json create mode 100644 tests/functional/measurements_concat/single_collection_measurements_1.json create mode 100644 tests/functional/measurements_concat/single_collection_measurements_2.json create mode 100644 tests/functional/measurements_concat/single_collection_measurements_3.json create mode 100644 tests/functional/measurements_concat/two_collections_measurements.json diff --git a/augur/measurements.py b/augur/measurements.py index 2c47a8353..c07b6df12 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -257,6 +257,22 @@ def export_measurements(args): validate_output_json(args['output_json']) +def concat_measurements(args): + output = { + 'collections': [] + } + if args.get("default_collection"): + output['default_collection'] = args['default_collection'] + + for json in args['jsons']: + measurements = validate_measurements_json(json) + output['collections'].extend(measurements['collections']) + + indent = {"indent": None} if args['minify_json'] else {} + write_json(output, args['output_json'], include_version=False, **indent) + validate_output_json(args['output_json']) + + def register_arguments(parser): subparsers = parser.add_subparsers(dest='subcommand') subparsers.required = True @@ -329,7 +345,20 @@ def register_arguments(parser): help="Export JSON without 
indentation or line returns.") + concat = subparsers.add_parser("concat", help="Concatenate multiple measurements JSONs into a single JSON file") + concat.add_argument("--jsons", required=True, type=str, nargs="+", metavar="JSONs", + help="Measurement JSON files to concatenate.") + concat.add_argument("--default-collection", type=str, + help="The key of the default collection to display. " + + "If not provided, the first collection of the first JSON file will be displayed") + concat.add_argument("--minify-json", action="store_true", + help="Concat JSONs without indentation or line returns.") + concat.add_argument("--output-json", required=True, metavar="JSON", type=str, + help="Output JSON file") + def run(args): if args.subcommand == 'export': return export_measurements(vars(args)) + if args.subcommand == "concat": + return concat_measurements(vars(args)) diff --git a/tests/functional/measurements_concat.t b/tests/functional/measurements_concat.t new file mode 100644 index 000000000..1f5b50ce5 --- /dev/null +++ b/tests/functional/measurements_concat.t @@ -0,0 +1,42 @@ +Integration tests for augur measurements export. + + $ pushd "$TESTDIR" > /dev/null + $ export AUGUR="../../bin/augur" + +Measurements concat for two measurements JSONs, each with a single collection. + + $ ${AUGUR} measurements concat \ + > --jsons measurements_concat/single_collection_measurements_1.json measurements_concat/single_collection_measurements_2.json \ + > --default-collection collection_1 \ + > --output-json "$TMP/two_collections_measurements.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" measurements_concat/two_collections_measurements.json "$TMP/two_collections_measurements.json" + {} + +Measurements concat for two measurements JSONs, where one has multiple collections. 
+ + $ ${AUGUR} measurements concat \ + > --jsons measurements_concat/two_collections_measurements.json measurements_concat/single_collection_measurements_3.json \ + > --default-collection collection_1 \ + > --output-json "$TMP/multiple_collections_measurements.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" measurements_concat/multiple_collections_measurements.json "$TMP/multiple_collections_measurements.json" + {} + +Measurements concat for measurements JSONs that have collections that share the same key. +This is expected to fail. + + $ ${AUGUR} measurements concat \ + > --jsons measurements_concat/single_collection_measurements_1.json measurements_concat/single_collection_measurements_1.json \ + > --default-collection collection_1 \ + > --output-json "$TMP/multiple_collections_measurements.json" &>/dev/null + [1] + +Measurements concat with an invalid default collection. +This is expected to fail. + + $ ${AUGUR} measurements concat \ + > --jsons measurements_concat/single_collection_measurements_1.json measurements_concat/single_collection_measurements_2.json \ + > --default-collection collection_3 \ + > --output-json "$TMP/multiple_collections_measurements.json" &>/dev/null + [1] diff --git a/tests/functional/measurements_concat/multiple_collections_measurements.json b/tests/functional/measurements_concat/multiple_collections_measurements.json new file mode 100644 index 000000000..5026fa3c8 --- /dev/null +++ b/tests/functional/measurements_concat/multiple_collections_measurements.json @@ -0,0 +1,101 @@ +{ + "collections": [ + { + "groupings": [ + { + "key": "field_1" + } + ], + "key": "collection_1", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + 
"strain": "strain_3", + "value": 3.0 + } + ], + "x_axis_label": "measurement values" + }, + { + "groupings": [ + { + "key": "field_1" + } + ], + "key": "collection_2", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "x_axis_label": "measurement values" + }, + { + "groupings": [ + { + "key": "field_1" + } + ], + "key": "collection_3", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "x_axis_label": "measurement values" + } + ], + "default_collection": "collection_1" +} \ No newline at end of file diff --git a/tests/functional/measurements_concat/single_collection_measurements_1.json b/tests/functional/measurements_concat/single_collection_measurements_1.json new file mode 100644 index 000000000..ac71c8c9e --- /dev/null +++ b/tests/functional/measurements_concat/single_collection_measurements_1.json @@ -0,0 +1,36 @@ +{ + "collections": [ + { + "groupings": [ + { + "key": "field_1" + } + ], + "key": "collection_1", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + 
"x_axis_label": "measurement values" + } + ] +} diff --git a/tests/functional/measurements_concat/single_collection_measurements_2.json b/tests/functional/measurements_concat/single_collection_measurements_2.json new file mode 100644 index 000000000..47cb18ff4 --- /dev/null +++ b/tests/functional/measurements_concat/single_collection_measurements_2.json @@ -0,0 +1,36 @@ +{ + "collections": [ + { + "groupings": [ + { + "key": "field_1" + } + ], + "key": "collection_2", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "x_axis_label": "measurement values" + } + ] +} diff --git a/tests/functional/measurements_concat/single_collection_measurements_3.json b/tests/functional/measurements_concat/single_collection_measurements_3.json new file mode 100644 index 000000000..790f948f5 --- /dev/null +++ b/tests/functional/measurements_concat/single_collection_measurements_3.json @@ -0,0 +1,36 @@ +{ + "collections": [ + { + "groupings": [ + { + "key": "field_1" + } + ], + "key": "collection_3", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "x_axis_label": "measurement values" + } + ] +} diff --git a/tests/functional/measurements_concat/two_collections_measurements.json b/tests/functional/measurements_concat/two_collections_measurements.json new file mode 100644 index 000000000..069b5e7cb --- /dev/null +++ 
b/tests/functional/measurements_concat/two_collections_measurements.json @@ -0,0 +1,69 @@ +{ + "collections": [ + { + "groupings": [ + { + "key": "field_1" + } + ], + "key": "collection_1", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "x_axis_label": "measurement values" + }, + { + "groupings": [ + { + "key": "field_1" + } + ], + "key": "collection_2", + "measurements": [ + { + "field_1": "value_1", + "field_2": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_2": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_2": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "x_axis_label": "measurement values" + } + ], + "default_collection": "collection_1" +} \ No newline at end of file From 2a8c0d918344c6bd80627a232decb51611d1ec3a Mon Sep 17 00:00:00 2001 From: Jover Date: Thu, 7 Apr 2022 15:16:44 -0700 Subject: [PATCH 08/21] Add autogenerated API documentation for augur.measurements --- docs/api/augur.measurements.rst | 7 +++++++ docs/api/augur.rst | 1 + 2 files changed, 8 insertions(+) create mode 100644 docs/api/augur.measurements.rst diff --git a/docs/api/augur.measurements.rst b/docs/api/augur.measurements.rst new file mode 100644 index 000000000..d0eb32409 --- /dev/null +++ b/docs/api/augur.measurements.rst @@ -0,0 +1,7 @@ +augur.measurements module +========================= + +.. 
automodule:: augur.measurements + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/augur.rst b/docs/api/augur.rst index 6b34aae8e..f751d38cf 100644 --- a/docs/api/augur.rst +++ b/docs/api/augur.rst @@ -35,6 +35,7 @@ Submodules augur.io augur.lbi augur.mask + augur.measurements augur.parse augur.reconstruct_sequences augur.refine From 8ea76a612435583669e5f751a2690c7071c2bd15 Mon Sep 17 00:00:00 2001 From: Jover Date: Tue, 19 Apr 2022 17:07:22 -0700 Subject: [PATCH 09/21] measurements: fix docstring for override_config_with_args --- augur/measurements.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index c07b6df12..56184f78e 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -157,8 +157,8 @@ def override_config_with_args(config, args): """ Overrides values in the config with values of provided command line args. - Parameter - --------- + Parameters + ---------- config: dict A collection config args: dict From 4551c44ac6a22073a8c74c07dba5c56800d9da4b Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 29 Apr 2022 17:32:45 -0700 Subject: [PATCH 10/21] measurements/validate: standardize print messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Pass long strings as multiple strings to `print()` and let it handle the formatting of the output with a default space separator so we don't have to think about it. 2. Standardize error messages by removing '«»' in favor of Python's built-in `repr()` formatting with `!r` in f-strings. 
--- augur/measurements.py | 14 +++++++------- augur/validate.py | 6 +++--- tests/test_validate.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index 56184f78e..6633b2802 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -44,7 +44,7 @@ def column_exists(collection, column, column_purpose): column_in_df = column in collection.columns if not column_in_df: print( - f"ERROR: Provided {column_purpose} column «{column}» does not exist in collection TSV.", + f"ERROR: Provided {column_purpose} column {column!r} does not exist in collection TSV.", file=sys.stderr, ) return column_in_df @@ -76,7 +76,7 @@ def load_collection(collection, strain_column, value_column): collection_df = pd.read_csv(collection, sep="\t") except FileNotFoundError: print( - f"ERROR: collection TSV file ({collection}) does not exist", + f"ERROR: collection TSV file {collection!r} does not exist", file=sys.stderr, ) return None @@ -105,8 +105,8 @@ def load_collection(collection, strain_column, value_column): if (required_column in collection_df.columns and provided_column != required_column): print( - f"ERROR: Cannot use provided '{provided_column}' column as the {required_column} column " + - f"because a '{required_column}' column already exists in collection TSV.", + f"ERROR: Cannot use provided {provided_column!r} column as the {required_column} column", + f"because a {required_column!r} column already exists in collection TSV.", file=sys.stderr, ) checks_passed = False @@ -121,7 +121,7 @@ def load_collection(collection, strain_column, value_column): try: collection_df['value'] = pd.to_numeric(collection_df['value']) except ValueError as e: - print(f"ERROR: Found a non-numeric measurement value: {e}", file=sys.stderr) + print(f"ERROR: Found a non-numeric measurement value: {e!r}", file=sys.stderr) return None return collection_df @@ -212,7 +212,7 @@ def export_measurements(args): collection_config = 
validate_collection_config_json(args['collection_config']) except ValidateError: print( - f"Validation of provided collection config JSON {args['collection_config']} failed. " + + f"Validation of provided collection config JSON {args['collection_config']!r} failed.", "Please check the formatting of this file.", file=sys.stderr ) @@ -223,7 +223,7 @@ def export_measurements(args): groupings = get_collection_groupings(collection_df, args['grouping_column']) if collection_config.get('display_defaults', {}).pop('group_by', None): print( - "WARNING: The default group-by in the collection config has been removed " + + "WARNING: The default group-by in the collection config has been removed", "because new groupings have been provided via the --grouping-column option.", file=sys.stderr ) diff --git a/augur/validate.py b/augur/validate.py index dac61ec0a..07ca06c5c 100644 --- a/augur/validate.py +++ b/augur/validate.py @@ -169,7 +169,7 @@ def validate_collection_config_fields(collection, index=None): valid_collection_config_fields = False include_index = f"(at index {index}) " if index is not None else "" print( - f"ERROR: Collection {include_index}includes {config_field} that " + + f"ERROR: Collection {include_index}includes {config_field} that", f"do not exist as fields in measurements: {invalid_fields}.", file=sys.stderr ) @@ -206,7 +206,7 @@ def validate_collection_display_defaults(collection, index=None): valid_display_defaults = False include_index = f"(at index {index}) " if index is not None else "" print( - f"ERROR: Collection {include_index}has a default group-by field " + + f"ERROR: Collection {include_index}has a default group-by field", f"'{default_grouping}' that is not included in the groupings' fields.", file=sys.stderr ) @@ -263,7 +263,7 @@ def validate_measurements_config(measurements): if default_collection and default_collection not in collection_keys.keys(): valid_measurements_config = False print( - f"ERROR: The default collection key 
«{default_collection}» does not match any of the collections' keys.", + f"ERROR: The default collection key {default_collection!r} does not match any of the collections' keys.", file=sys.stderr ) diff --git a/tests/test_validate.py b/tests/test_validate.py index 1caeee20d..427e3947d 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -87,4 +87,4 @@ def test_validate_measurements_config_invalid_default_collection(self, example_m "default_collection": "invalid_collection" } assert not validate_measurements_config(measurements) - assert capsys.readouterr().err == "ERROR: The default collection key «invalid_collection» does not match any of the collections' keys.\n" + assert capsys.readouterr().err == "ERROR: The default collection key 'invalid_collection' does not match any of the collections' keys.\n" From 2e6095d29d5c5f30a661887eeb16e4d4f397482a Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 29 Apr 2022 17:37:50 -0700 Subject: [PATCH 11/21] measurements: edit help messages 1. consistently capitalize "Auspice" 2. fully spell out "Concatenate" 3. add generic description for `--threshold` option --- augur/measurements.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index 6633b2802..acf54fe8d 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -301,7 +301,7 @@ def register_arguments(parser): config = export.add_argument_group( title="COLLECTION CONFIGURATION", - description="These options control the configuration of the collection for auspice." + + description="These options control the configuration of the collection for Auspice. " + "You can provide a config JSON (which includes all available options) or " + "command line arguments (which are more limited). " + "Command line arguments will override the values set in the config JSON." 
@@ -313,7 +313,7 @@ def register_arguments(parser): "Note that if groupings are provided via command line args, the default group-by " + "field in the config JSON will be dropped.") config.add_argument("--key", - help="A short key name of the collection for internal use within auspice." + + help="A short key name of the collection for internal use within Auspice. " + "If not provided via config or command line option, the collection TSV filename will be used. ") config.add_argument("--title", help="The full title of the collection to display in the measurements panel title. " + @@ -322,7 +322,7 @@ def register_arguments(parser): help="The short label to display for the x-axis that describles the value of the measurements. " + "If not provided via config or command line option, the panel's default x-axis label is 'measurements values'.") config.add_argument("--threshold", type=float, - help="A measurements value threshold to be displayed as a single grey line shared across subplots.") + help="A measurements value threshold to be displayed in the measurements panel.") config.add_argument("--filters", nargs="+", help="The columns that are to be used a filters for measurements. " + "If not provided, all columns will be available as filters.") @@ -352,7 +352,7 @@ def register_arguments(parser): help="The key of the default collection to display. 
" + "If not provided, the first collection of the first JSON file will be displayed") concat.add_argument("--minify-json", action="store_true", - help="Concat JSONs without indentation or line returns.") + help="Concatenate JSONs without indentation or line returns.") concat.add_argument("--output-json", required=True, metavar="JSON", type=str, help="Output JSON file") From cd6241c79c7f40b60be62aaf52e9defa9d5ed442 Mon Sep 17 00:00:00 2001 From: Jover Date: Wed, 4 May 2022 12:38:48 -0700 Subject: [PATCH 12/21] measurements: use global dict to hold default args A subset of optional args (currently just `title` and `x-axis-label`) need a default value for Auspice but we also do not want to overwrite the config file with the argument default values every time. Use a global dict `DEFAULT_ARGS` to hold these default values instead of the argparse `default` argument. --- augur/measurements.py | 16 +++++++++++++--- .../minimal_measurements.json | 1 + 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index acf54fe8d..ed90324d0 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -13,6 +13,14 @@ ValidateError ) +# Default values for optional arguments that can also be provided via config file +# Setting as global dict instead of using argparse default so that the +# config file does not always get overwritten by the default values +DEFAULT_ARGS = { + 'title': 'Measurements', + 'x_axis_label': 'measurement values', +} + class HideAsFalseAction(argparse.Action): """ Custom argparse Action that stores False for arguments passed as `--hide*` @@ -238,8 +246,9 @@ def export_measurements(args): # Create collection output object with default values for required keys collection_output = { 'key': collection_config.pop('key', os.path.basename(args['collection'])), + 'title': collection_config.pop('title', DEFAULT_ARGS['title']), 'groupings': groupings, - 'x_axis_label': collection_config.pop('x_axis_label', 
'measurement values'), + 'x_axis_label': collection_config.pop('x_axis_label', DEFAULT_ARGS['x_axis_label']), 'measurements': collection_df.to_dict(orient='records'), **collection_config } @@ -317,10 +326,11 @@ def register_arguments(parser): "If not provided via config or command line option, the collection TSV filename will be used. ") config.add_argument("--title", help="The full title of the collection to display in the measurements panel title. " + - "If not provided via config or command line option, the panel's default title is 'Measurements'.") + f"If not provided via config or command line option, the panel's default title is {DEFAULT_ARGS['title']!r}.") config.add_argument("--x-axis-label", help="The short label to display for the x-axis that describles the value of the measurements. " + - "If not provided via config or command line option, the panel's default x-axis label is 'measurements values'.") + "If not provided via config or command line option, the panel's default " + + f"x-axis label is {DEFAULT_ARGS['x_axis_label']!r}.") config.add_argument("--threshold", type=float, help="A measurements value threshold to be displayed in the measurements panel.") config.add_argument("--filters", nargs="+", diff --git a/tests/functional/measurements_export/minimal_measurements.json b/tests/functional/measurements_export/minimal_measurements.json index a2e2e8423..8c4ef63ec 100644 --- a/tests/functional/measurements_export/minimal_measurements.json +++ b/tests/functional/measurements_export/minimal_measurements.json @@ -30,6 +30,7 @@ "value": 3.0 } ], + "title": "Measurements", "x_axis_label": "measurement values" } ] From 5cbb89c419e30a67a242ac3b49602d263ed94bc7 Mon Sep 17 00:00:00 2001 From: Jover Date: Thu, 5 May 2022 12:53:00 -0700 Subject: [PATCH 13/21] move `HideAsFalseAction` to shared utils module Seems like a handy argparse action that can be used by other modules. 
Suggested by @huddlej in review --- augur/measurements.py | 11 +---------- augur/utils.py | 8 ++++++++ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index ed90324d0..8b5406326 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -1,12 +1,11 @@ """ Create JSON files suitable for visualization within the measurements panel of Auspice. """ -import argparse import os import pandas as pd import sys -from .utils import write_json +from .utils import write_json, HideAsFalseAction from .validate import ( measurements as validate_measurements_json, measurements_collection_config as validate_collection_config_json, @@ -21,14 +20,6 @@ 'x_axis_label': 'measurement values', } -class HideAsFalseAction(argparse.Action): - """ - Custom argparse Action that stores False for arguments passed as `--hide*` - and stores True for all other argument patterns. - """ - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, option_string[2:6] != 'hide') - def column_exists(collection, column, column_purpose): """ diff --git a/augur/utils.py b/augur/utils.py index b4b73b54e..27699574b 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -540,3 +540,11 @@ def read_strains(*files, comment_char="#"): strains.add(strain_name) return strains + +class HideAsFalseAction(argparse.Action): + """ + Custom argparse Action that stores False for arguments passed as `--hide*` + and stores True for all other argument patterns. + """ + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, option_string[2:6] != 'hide') From e6a4a3fc1b4a17cfaa79460c342e8e57d82f5cbe Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 6 May 2022 11:20:50 -0700 Subject: [PATCH 14/21] tests: measurements concat keep stderr output Only write stdout to `/dev/null` so that we can track the expected error messages in the Cram tests. 
--- tests/functional/measurements_concat.t | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/functional/measurements_concat.t b/tests/functional/measurements_concat.t index 1f5b50ce5..00fe0fe56 100644 --- a/tests/functional/measurements_concat.t +++ b/tests/functional/measurements_concat.t @@ -29,7 +29,9 @@ This is expected to fail. $ ${AUGUR} measurements concat \ > --jsons measurements_concat/single_collection_measurements_1.json measurements_concat/single_collection_measurements_1.json \ > --default-collection collection_1 \ - > --output-json "$TMP/multiple_collections_measurements.json" &>/dev/null + > --output-json "$TMP/multiple_collections_measurements.json" 1>/dev/null + ERROR: Collections at indexes [0, 1] share the same collection key 'collection_1'. + ERROR: Validation of output JSON failed. See detailed errors above. [1] Measurements concat with an invalid default collection. @@ -38,5 +40,7 @@ This is expected to fail. $ ${AUGUR} measurements concat \ > --jsons measurements_concat/single_collection_measurements_1.json measurements_concat/single_collection_measurements_2.json \ > --default-collection collection_3 \ - > --output-json "$TMP/multiple_collections_measurements.json" &>/dev/null + > --output-json "$TMP/multiple_collections_measurements.json" 1>/dev/null + ERROR: The default collection key 'collection_3' does not match any of the collections' keys. + ERROR: Validation of output JSON failed. See detailed errors above. [1] From d8272f4a403f5c4bcd47e09d214cc1c7cb1bb042 Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 6 May 2022 14:49:54 -0700 Subject: [PATCH 15/21] measurements: remove function `column_exists` Suggested by @huddlej in review. The error printed within the function seems like a side effect. The code would be clearer if the error printing is handled directly with the boolean check. 
--- augur/measurements.py | 44 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index 8b5406326..67037ea65 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -21,34 +21,6 @@ } -def column_exists(collection, column, column_purpose): - """ - Checks the provided column exists in the provided collection. - Prints an error message to stderr if the column does not exist. - - Parameters - ---------- - collection: pandas.DataFrame - Collection of measurements and metadata - column: str - Column to check exists in the collection - column_purpose: str - Purpose of provided column for detailed error message - - Returns - ------- - bool - True if column exists in collection - """ - column_in_df = column in collection.columns - if not column_in_df: - print( - f"ERROR: Provided {column_purpose} column {column!r} does not exist in collection TSV.", - file=sys.stderr, - ) - return column_in_df - - def load_collection(collection, strain_column, value_column): """ Loads the provided collection TSV as a pandas DataFrame. 
@@ -98,7 +70,11 @@ def load_collection(collection, strain_column, value_column): checks_passed = True for provided_column, required_column in required_column_map.items(): # Confirm the provided column exists - if not column_exists(collection_df, provided_column, required_column): + if provided_column not in collection_df.columns: + print( + f"ERROR: Provided {required_column} column {provided_column!r} does not exist in collection TSV.", + file=sys.stderr, + ) checks_passed = False # Confirm the provided column does not overwrite an existing column if (required_column in collection_df.columns and @@ -143,7 +119,15 @@ def get_collection_groupings(collection, grouping_columns): list[dict] or None The groupings for the collection config or None if all grouping columns are invalid """ - groupings = [{'key': col} for col in grouping_columns if column_exists(collection, col, "grouping")] + groupings = [] + for column in grouping_columns: + if column in collection.columns: + groupings.append({'key': column}) + else: + print( + f"ERROR: Provided grouping column {column!r} does not exist in collection TSV.", + file=sys.stderr, + ) if not groupings: print("ERROR: Provided grouping columns were invalid for provided collection.", file=sys.stderr) From 3796b4ac4c212776f0e8ad8bf098c7e36a66bd9e Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 6 May 2022 15:00:23 -0700 Subject: [PATCH 16/21] measurements export: exit with error if any groupings invalid If any of the groupings provided by the user does not exist as a column, then exit command with an error message to ensure that these are not ignored by the user. If any defined grouping does not exist, then it might indicate a greater error such as passing the wrong collection TSV. 
--- augur/measurements.py | 11 ++++------- tests/functional/measurements_export.t | 11 +++++++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index 67037ea65..f4d048a76 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -117,21 +117,18 @@ def get_collection_groupings(collection, grouping_columns): Returns ------- list[dict] or None - The groupings for the collection config or None if all grouping columns are invalid + The groupings for the collection config or None any grouping columns are invalid """ groupings = [] for column in grouping_columns: - if column in collection.columns: - groupings.append({'key': column}) - else: + if column not in collection.columns: print( f"ERROR: Provided grouping column {column!r} does not exist in collection TSV.", file=sys.stderr, ) + return None - if not groupings: - print("ERROR: Provided grouping columns were invalid for provided collection.", file=sys.stderr) - return None + groupings.append({'key': column}) return groupings diff --git a/tests/functional/measurements_export.t b/tests/functional/measurements_export.t index 8be8b0fa9..95be95f06 100644 --- a/tests/functional/measurements_export.t +++ b/tests/functional/measurements_export.t @@ -53,6 +53,17 @@ This is expected to fail. ERROR: Loading of collection TSV was unsuccessful. See detailed errors above. [1] +Try measurements export with invalid grouping columns. +This is expected to fail. + + $ ${AUGUR} measurements export \ + > --collection measurements_export/collection.tsv \ + > --grouping-column bad_field \ + > --output-json "$TMP/minimal_measurements.json" + ERROR: Provided grouping column 'bad_field' does not exist in collection TSV. + ERROR: Cannot create measurements JSON without valid groupings + [1] + Measurements export for a single collection using only command line configs. 
$ ${AUGUR} measurements export \ From dd841fc4ee36370ab42cea85557e44b66372033d Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 6 May 2022 15:47:29 -0700 Subject: [PATCH 17/21] measurements: rename `validate_*` as `read_*` Makes it clearer that these functions will read and return the file contents. Similar to other `io` functions in Augur, the validation of the file structure is an expected side effect. --- augur/measurements.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index f4d048a76..4a516692b 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -7,8 +7,8 @@ from .utils import write_json, HideAsFalseAction from .validate import ( - measurements as validate_measurements_json, - measurements_collection_config as validate_collection_config_json, + measurements as read_measurements_json, + measurements_collection_config as read_collection_config_json, ValidateError ) @@ -169,7 +169,7 @@ def validate_output_json(output_json): """ print("Validating produced measurements JSON") try: - validate_measurements_json(measurements_json=output_json) + read_measurements_json(measurements_json=output_json) except ValidateError: print( "ERROR: Validation of output JSON failed. 
See detailed errors above.", @@ -189,7 +189,7 @@ def export_measurements(args): collection_config = {} if args.get('collection_config'): try: - collection_config = validate_collection_config_json(args['collection_config']) + collection_config = read_collection_config_json(args['collection_config']) except ValidateError: print( f"Validation of provided collection config JSON {args['collection_config']!r} failed.", @@ -246,7 +246,7 @@ def concat_measurements(args): output['default_collection'] = args['default_collection'] for json in args['jsons']: - measurements = validate_measurements_json(json) + measurements = read_measurements_json(json) output['collections'].extend(measurements['collections']) indent = {"indent": None} if args['minify_json'] else {} From 8fe026a7d5642f7d82764143fb8c424d6534ad94 Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 6 May 2022 16:11:44 -0700 Subject: [PATCH 18/21] measurements: add argument groups to concat Renamed argument group variables for export as well to differentiate the argument groups for each subcommand. --- augur/measurements.py | 56 ++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index 4a516692b..923254fb1 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -260,83 +260,91 @@ def register_arguments(parser): export = subparsers.add_parser("export", help="Export a measurements JSON for a single collection") - required = export.add_argument_group( + export_required = export.add_argument_group( title="REQUIRED" ) - required.add_argument("--collection", required=True, metavar="TSV", + export_required.add_argument("--collection", required=True, metavar="TSV", help="Collection of measurements and metadata in a TSV file. 
" + "Keep in mind duplicate columns will be renamed as 'X', 'X.1', 'X.2'...'X.N'") - required.add_argument("--strain-column", default="strain", + export_required.add_argument("--strain-column", default="strain", help="Name of the column containing strain names. " + "Provided column will be renamed to `strain` so please make sure no other columns are named `strain`. " + "Strain names in this column should match the strain names in the corresponding Auspice dataset JSON. " + "(default: %(default)s)") - required.add_argument("--value-column", default="value", + export_required.add_argument("--value-column", default="value", help="Name of the column containing the numeric values to be plotted for the given collection. " + "Provided column will be renamed to `value` so please make sure no other columns are named `value`. " + "(default: %(default)s)") - required.add_argument("--output-json", required=True, metavar="JSON", type=str, + export_required.add_argument("--output-json", required=True, metavar="JSON", type=str, help="Output JSON file. " + "The file name must follow the Auspice sidecar file naming convention to be recognized as a sidecar file. " + "See Nextstrain data format docs for more details.") - config = export.add_argument_group( + export_config = export.add_argument_group( title="COLLECTION CONFIGURATION", description="These options control the configuration of the collection for Auspice. " + "You can provide a config JSON (which includes all available options) or " + "command line arguments (which are more limited). " + "Command line arguments will override the values set in the config JSON." ) - config.add_argument("--collection-config", metavar="JSON", + export_config.add_argument("--collection-config", metavar="JSON", help="Collection configuration file for advanced configurations. 
") - config.add_argument("--grouping-column", nargs="+", + export_config.add_argument("--grouping-column", nargs="+", help="Name of the column(s) that should be used as grouping(s) for measurements. " + "Note that if groupings are provided via command line args, the default group-by " + "field in the config JSON will be dropped.") - config.add_argument("--key", + export_config.add_argument("--key", help="A short key name of the collection for internal use within Auspice. " + "If not provided via config or command line option, the collection TSV filename will be used. ") - config.add_argument("--title", + export_config.add_argument("--title", help="The full title of the collection to display in the measurements panel title. " + f"If not provided via config or command line option, the panel's default title is {DEFAULT_ARGS['title']!r}.") - config.add_argument("--x-axis-label", + export_config.add_argument("--x-axis-label", help="The short label to display for the x-axis that describles the value of the measurements. " + "If not provided via config or command line option, the panel's default " + f"x-axis label is {DEFAULT_ARGS['x_axis_label']!r}.") - config.add_argument("--threshold", type=float, + export_config.add_argument("--threshold", type=float, help="A measurements value threshold to be displayed in the measurements panel.") - config.add_argument("--filters", nargs="+", + export_config.add_argument("--filters", nargs="+", help="The columns that are to be used a filters for measurements. " + "If not provided, all columns will be available as filters.") - config.add_argument("--group-by", type=str, + export_config.add_argument("--group-by", type=str, help="The default grouping column. 
If not provided, the first grouping will be used.") - config.add_argument("--measurements-display", type=str, choices=["raw", "mean"], + export_config.add_argument("--measurements-display", type=str, choices=["raw", "mean"], help="The default display of the measurements") - config.add_argument("--show-overall-mean", "--hide-overall-mean", + export_config.add_argument("--show-overall-mean", "--hide-overall-mean", dest="show_overall_mean", action=HideAsFalseAction, nargs=0, help="Show or hide the overall mean per group by default") - config.add_argument("--show-threshold", "--hide-threshold", + export_config.add_argument("--show-threshold", "--hide-threshold", dest="show_threshold", action=HideAsFalseAction, nargs=0, help="Show or hide the threshold by default. This will be ignored if no threshold is provided.") - optional_settings = export.add_argument_group( + export_optional = export.add_argument_group( title="OPTIONAL SETTINGS" ) - optional_settings.add_argument("--minify-json", action="store_true", + export_optional.add_argument("--minify-json", action="store_true", help="Export JSON without indentation or line returns.") concat = subparsers.add_parser("concat", help="Concatenate multiple measurements JSONs into a single JSON file") - concat.add_argument("--jsons", required=True, type=str, nargs="+", metavar="JSONs", + concat_required = concat.add_argument_group( + title="REQUIRED" + ) + concat_required.add_argument("--jsons", required=True, type=str, nargs="+", metavar="JSONs", help="Measurement JSON files to concatenate.") - concat.add_argument("--default-collection", type=str, + concat_required.add_argument("--output-json", required=True, metavar="JSON", type=str, + help="Output JSON file") + + concat_optional = concat.add_argument_group( + title="OPTIONAL SETTINGS" + ) + concat_optional.add_argument("--default-collection", type=str, help="The key of the default collection to display. 
" + "If not provided, the first collection of the first JSON file will be displayed") - concat.add_argument("--minify-json", action="store_true", + concat_optional.add_argument("--minify-json", action="store_true", help="Concatenate JSONs without indentation or line returns.") - concat.add_argument("--output-json", required=True, metavar="JSON", type=str, - help="Output JSON file") + def run(args): From cbace7c1a00c1bea2a763d80ccad26536f988fbd Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 6 May 2022 16:26:27 -0700 Subject: [PATCH 19/21] measurements: use argparse.Namespace object Follow the pattern used by most Augur subcommands, which uses the argparse.Namespace object in their "main" logic --- augur/measurements.py | 50 ++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index 923254fb1..e1caee65e 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -141,20 +141,22 @@ def override_config_with_args(config, args): ---------- config: dict A collection config - args: dict - The __dict__ attribute of the parsed arguments from argparse + args: argparse.Namespace + Command line arguments provided by the user. 
""" config_key_args = ['key', 'title', 'filters', 'x_axis_label', 'threshold'] display_default_args = ['group_by', 'measurements_display', 'show_overall_mean', 'show_threshold'] for key_arg in config_key_args: - if args.get(key_arg) is not None: - config[key_arg] = args[key_arg] + key_arg_value = getattr(args, key_arg) + if key_arg_value is not None: + config[key_arg] = key_arg_value for default_arg in display_default_args: - if args.get(default_arg) is not None: + default_arg_value = getattr(args, default_arg) + if default_arg_value is not None: config['display_defaults'] = config.get('display_defaults', {}) - config['display_defaults'][default_arg] = args[default_arg] + config['display_defaults'][default_arg] = default_arg_value def validate_output_json(output_json): @@ -180,27 +182,27 @@ def validate_output_json(output_json): def export_measurements(args): # Load input collection TSV file - collection_df = load_collection(args['collection'], args['strain_column'], args['value_column']) + collection_df = load_collection(args.collection, args.strain_column, args.value_column) if collection_df is None: print("ERROR: Loading of collection TSV was unsuccessful. 
See detailed errors above.", file=sys.stderr) sys.exit(1) collection_config = {} - if args.get('collection_config'): + if args.collection_config is not None: try: - collection_config = read_collection_config_json(args['collection_config']) + collection_config = read_collection_config_json(args.collection_config) except ValidateError: print( - f"Validation of provided collection config JSON {args['collection_config']!r} failed.", + f"Validation of provided collection config JSON {args.collection_config!r} failed.", "Please check the formatting of this file.", file=sys.stderr ) sys.exit(1) groupings = collection_config.pop('groupings', None) - if args.get('grouping_column'): - groupings = get_collection_groupings(collection_df, args['grouping_column']) + if args.grouping_column is not None: + groupings = get_collection_groupings(collection_df, args.grouping_column) if collection_config.get('display_defaults', {}).pop('group_by', None): print( "WARNING: The default group-by in the collection config has been removed", @@ -217,7 +219,7 @@ def export_measurements(args): # Create collection output object with default values for required keys collection_output = { - 'key': collection_config.pop('key', os.path.basename(args['collection'])), + 'key': collection_config.pop('key', os.path.basename(args.collection)), 'title': collection_config.pop('title', DEFAULT_ARGS['title']), 'groupings': groupings, 'x_axis_label': collection_config.pop('x_axis_label', DEFAULT_ARGS['x_axis_label']), @@ -231,27 +233,27 @@ def export_measurements(args): } # Set indentation to None to create compact JSON if specified - indent = {"indent": None} if args['minify_json'] else {} + indent = {"indent": None} if args.minify_json else {} # Create output JSON - write_json(output, args['output_json'], include_version=False, **indent) + write_json(output, args.output_json, include_version=False, **indent) # Verify the produced output is a valid measurements JSON - 
validate_output_json(args['output_json']) + validate_output_json(args.output_json) def concat_measurements(args): output = { 'collections': [] } - if args.get("default_collection"): - output['default_collection'] = args['default_collection'] + if args.default_collection is not None: + output['default_collection'] = args.default_collection - for json in args['jsons']: + for json in args.jsons: measurements = read_measurements_json(json) output['collections'].extend(measurements['collections']) - indent = {"indent": None} if args['minify_json'] else {} - write_json(output, args['output_json'], include_version=False, **indent) - validate_output_json(args['output_json']) + indent = {"indent": None} if args.minify_json else {} + write_json(output, args.output_json, include_version=False, **indent) + validate_output_json(args.output_json) def register_arguments(parser): @@ -349,6 +351,6 @@ def register_arguments(parser): def run(args): if args.subcommand == 'export': - return export_measurements(vars(args)) + return export_measurements(args) if args.subcommand == "concat": - return concat_measurements(vars(args)) + return concat_measurements(args) From b510e7a001029eaab4ee1a0277050e1aa519b662 Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 6 May 2022 17:39:35 -0700 Subject: [PATCH 20/21] measurements export: add `--include-columns` option Allows users to specify a list of columns from their collection TSV to include in the output measurements JSON. All columns will be included by default if option is not provided. The 'strain' and 'value' columns do not have to be included in the list since these are required columns. However, other configuration columns (e.g. groupings) will need to be explicitly included in the list if users are using the option. This ensures that we do not give unexpected outputs by auto-including grouping columns. Includes new functional test for the new option. 
--- augur/measurements.py | 39 ++++++++++++++++--- tests/functional/measurements_export.t | 27 +++++++++++++ .../minimal_measurements_subset.json | 34 ++++++++++++++++ 3 files changed, 94 insertions(+), 6 deletions(-) create mode 100644 tests/functional/measurements_export/minimal_measurements_subset.json diff --git a/augur/measurements.py b/augur/measurements.py index e1caee65e..22e688bdd 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -21,12 +21,15 @@ } -def load_collection(collection, strain_column, value_column): +def load_collection(collection, strain_column, value_column, include_columns): """ Loads the provided collection TSV as a pandas DataFrame. Renames the provided strain and value columns if needed and ensures the value column has a numeric dtype. + Loads all columns by default. If columns are provided via *include_columns* + then only use listed columns (plus *strain_column* and *value_column*). + Prints any error messages to stderr. Parameters @@ -37,14 +40,21 @@ def load_collection(collection, strain_column, value_column): The name of the strain column within the collection TSV value_column: str The name of the value column within the collection TSV + include_columns: list[str] + List of columns to include in the collections DataFrame Returns ------- pandas.DataFrame or None The collection DataFrame or None if any errors were encountered during loading """ + # Default value to None so all columns will be read + columns_to_include = None + if include_columns: + columns_to_include = set([strain_column, value_column] + include_columns) + try: - collection_df = pd.read_csv(collection, sep="\t") + collection_df = pd.read_csv(collection, sep="\t", usecols=columns_to_include) except FileNotFoundError: print( f"ERROR: collection TSV file {collection!r} does not exist", @@ -102,15 +112,18 @@ def load_collection(collection, strain_column, value_column): return collection_df -def get_collection_groupings(collection, grouping_columns): +def 
get_collection_groupings(collection, include_columns, grouping_columns): """ Creates the groupings for the provided collection using the provided - grouping columns after verifying the columns exist in the collection. + grouping columns after verifying the columns exist the user provided + include_columns and in the collection. Parameters ---------- collection: pandas.DataFrame The collection used to validate groupings + include_columns: list[str] + List of user provided columns to include in collection grouping_columns: list[str] List of grouping column names @@ -121,6 +134,16 @@ def get_collection_groupings(collection, grouping_columns): """ groupings = [] for column in grouping_columns: + # If the user specified columns to include, verify the grouping column was included + if include_columns and column not in include_columns: + print( + f"ERROR: Provided grouping column {column!r} was not in the", + f"list of columns to include: {include_columns}.", + file=sys.stderr, + ) + return None + + # Verify the grouping column is included in the collection if column not in collection.columns: print( f"ERROR: Provided grouping column {column!r} does not exist in collection TSV.", @@ -182,7 +205,7 @@ def validate_output_json(output_json): def export_measurements(args): # Load input collection TSV file - collection_df = load_collection(args.collection, args.strain_column, args.value_column) + collection_df = load_collection(args.collection, args.strain_column, args.value_column, args.include_columns) if collection_df is None: print("ERROR: Loading of collection TSV was unsuccessful. 
See detailed errors above.", file=sys.stderr) @@ -202,7 +225,7 @@ def export_measurements(args): groupings = collection_config.pop('groupings', None) if args.grouping_column is not None: - groupings = get_collection_groupings(collection_df, args.grouping_column) + groupings = get_collection_groupings(collection_df, args.include_columns, args.grouping_column) if collection_config.get('display_defaults', {}).pop('group_by', None): print( "WARNING: The default group-by in the collection config has been removed", @@ -325,6 +348,10 @@ def register_arguments(parser): export_optional = export.add_argument_group( title="OPTIONAL SETTINGS" ) + export_optional.add_argument("--include-columns", nargs="+", + help="The columns to include from the collection TSV in the measurements JSON. " + + "Be sure to list columns that are used as groupings and/or filters. " + + "If no columns are provided, then all columns will be included by default.") export_optional.add_argument("--minify-json", action="store_true", help="Export JSON without indentation or line returns.") diff --git a/tests/functional/measurements_export.t b/tests/functional/measurements_export.t index 95be95f06..1f31f2042 100644 --- a/tests/functional/measurements_export.t +++ b/tests/functional/measurements_export.t @@ -53,6 +53,33 @@ This is expected to fail. ERROR: Loading of collection TSV was unsuccessful. See detailed errors above. [1] +Minimal measurements export with user provided strain, value, and subset of columns. 
+ + $ ${AUGUR} measurements export \ + > --collection measurements_export/collection_without_strain_value_columns.tsv \ + > --strain-column strain_field \ + > --value-column value_field \ + > --grouping-column field_1 \ + > --include-columns field_1 field_3 \ + > --output-json "$TMP/minimal_measurements_subset.json" &>/dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" measurements_export/minimal_measurements_subset.json "$TMP/minimal_measurements_subset.json" + {} + +Try measurements export with grouping column missing from include columns list +This is expected to fail. + + $ ${AUGUR} measurements export \ + > --collection measurements_export/collection_without_strain_value_columns.tsv \ + > --strain-column strain_field \ + > --value-column value_field \ + > --grouping-column field_1 \ + > --include-columns field_3 \ + > --output-json "$TMP/minimal_measurements_subset.json" 1>/dev/null + ERROR: Provided grouping column 'field_1' was not in the list of columns to include: ['field_3']. + ERROR: Cannot create measurements JSON without valid groupings + [1] + Try measurements export with invalid grouping columns. This is expected to fail. 
diff --git a/tests/functional/measurements_export/minimal_measurements_subset.json b/tests/functional/measurements_export/minimal_measurements_subset.json new file mode 100644 index 000000000..4cbbef55a --- /dev/null +++ b/tests/functional/measurements_export/minimal_measurements_subset.json @@ -0,0 +1,34 @@ +{ + "collections": [ + { + "groupings": [ + { + "key": "field_1" + } + ], + "key": "collection_without_strain_value_columns.tsv", + "measurements": [ + { + "field_1": "value_1", + "field_3": "value_1", + "strain": "strain_1", + "value": 1.0 + }, + { + "field_1": "value_2", + "field_3": "value_2", + "strain": "strain_2", + "value": 2.0 + }, + { + "field_1": "value_3", + "field_3": "value_3", + "strain": "strain_3", + "value": 3.0 + } + ], + "title": "Measurements", + "x_axis_label": "measurement values" + } + ] +} \ No newline at end of file From 8fbdc823a4d1158e2923b21984fdad3d2f837e1e Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 6 May 2022 18:30:46 -0700 Subject: [PATCH 21/21] measurements: overhaul to single "main" function The measurements sub-subcommands are so straightforward that it seems to add unnecessary complexity by breaking out into functions that are mainly used to raise errors that then have to be caught by the "main" function. --- augur/measurements.py | 207 ++++++++----------------- tests/functional/measurements_export.t | 4 - 2 files changed, 63 insertions(+), 148 deletions(-) diff --git a/augur/measurements.py b/augur/measurements.py index 22e688bdd..9cc78f1ad 100644 --- a/augur/measurements.py +++ b/augur/measurements.py @@ -21,59 +21,34 @@ } -def load_collection(collection, strain_column, value_column, include_columns): - """ - Loads the provided collection TSV as a pandas DataFrame. - Renames the provided strain and value columns if needed and ensures the - value column has a numeric dtype. - - Loads all columns by default. 
If columns are provided via *include_columns* - then only use listed columns (plus *strain_column* and *value_column*). - - Prints any error messages to stderr. - - Parameters - ---------- - collection: str - Filepath to the collection TSV file - strain_column: str - The name of the strain column within the collection TSV - value_column: str - The name of the value column within the collection TSV - include_columns: list[str] - List of columns to include in the collections DataFrame - - Returns - ------- - pandas.DataFrame or None - The collection DataFrame or None if any errors were encountered during loading - """ +def export_measurements(args): # Default value to None so all columns will be read columns_to_include = None - if include_columns: - columns_to_include = set([strain_column, value_column] + include_columns) + if args.include_columns is not None: + columns_to_include = set([args.strain_column, args.value_column] + args.include_columns) + # Load input collection TSV file try: - collection_df = pd.read_csv(collection, sep="\t", usecols=columns_to_include) + collection_df = pd.read_csv(args.collection, sep="\t", usecols=columns_to_include) except FileNotFoundError: print( - f"ERROR: collection TSV file {collection!r} does not exist", + f"ERROR: collection TSV file {args.collection!r} does not exist", file=sys.stderr, ) - return None + sys.exit(1) # Verify the strain and value columns are different - if strain_column == value_column: + if args.strain_column == args.value_column: print( "ERROR: The strain column and value column cannot be the same column.", file=sys.stderr ) - return None + sys.exit(1) - # Define mapping of requried columns to user provided columns + # Define mapping of required columns to user provided columns required_column_map = { - strain_column: 'strain', - value_column: 'value', + args.strain_column: 'strain', + args.value_column: 'value', } # Check all required columns are included in collection TSV @@ -97,7 +72,7 @@ def 
load_collection(collection, strain_column, value_column, include_columns): checks_passed = False if not checks_passed: - return None + sys.exit(1) # Rename user provided columns to expected columns collection_df = collection_df.rename(columns=required_column_map) @@ -107,108 +82,6 @@ def load_collection(collection, strain_column, value_column, include_columns): collection_df['value'] = pd.to_numeric(collection_df['value']) except ValueError as e: print(f"ERROR: Found a non-numeric measurement value: {e!r}", file=sys.stderr) - return None - - return collection_df - - -def get_collection_groupings(collection, include_columns, grouping_columns): - """ - Creates the groupings for the provided collection using the provided - grouping columns after verifying the columns exist the user provided - include_columns and in the collection. - - Parameters - ---------- - collection: pandas.DataFrame - The collection used to validate groupings - include_columns: list[str] - List of user provided columns to include in collection - grouping_columns: list[str] - List of grouping column names - - Returns - ------- - list[dict] or None - The groupings for the collection config or None any grouping columns are invalid - """ - groupings = [] - for column in grouping_columns: - # If the user specified columns to include, verify the grouping column was included - if include_columns and column not in include_columns: - print( - f"ERROR: Provided grouping column {column!r} was not in the", - f"list of columns to include: {include_columns}.", - file=sys.stderr, - ) - return None - - # Verify the grouping column is included in the collection - if column not in collection.columns: - print( - f"ERROR: Provided grouping column {column!r} does not exist in collection TSV.", - file=sys.stderr, - ) - return None - - groupings.append({'key': column}) - - return groupings - - -def override_config_with_args(config, args): - """ - Overrides values in the config with values of provided command line 
args. - - Parameters - ---------- - config: dict - A collection config - args: argparse.Namespace - Command line arguments provided by the user. - """ - config_key_args = ['key', 'title', 'filters', 'x_axis_label', 'threshold'] - display_default_args = ['group_by', 'measurements_display', 'show_overall_mean', 'show_threshold'] - - for key_arg in config_key_args: - key_arg_value = getattr(args, key_arg) - if key_arg_value is not None: - config[key_arg] = key_arg_value - - for default_arg in display_default_args: - default_arg_value = getattr(args, default_arg) - if default_arg_value is not None: - config['display_defaults'] = config.get('display_defaults', {}) - config['display_defaults'][default_arg] = default_arg_value - - -def validate_output_json(output_json): - """ - Validate the output JSON against the measurements schema - - Parameters - ---------- - output_json: str - Filepath to output JSON - - """ - print("Validating produced measurements JSON") - try: - read_measurements_json(measurements_json=output_json) - except ValidateError: - print( - "ERROR: Validation of output JSON failed. See detailed errors above.", - file=sys.stderr, - ) - sys.exit(1) - - -def export_measurements(args): - # Load input collection TSV file - collection_df = load_collection(args.collection, args.strain_column, args.value_column, args.include_columns) - - if collection_df is None: - print("ERROR: Loading of collection TSV was unsuccessful. 
See detailed errors above.", file=sys.stderr) sys.exit(1) collection_config = {} @@ -225,7 +98,27 @@ def export_measurements(args): groupings = collection_config.pop('groupings', None) if args.grouping_column is not None: - groupings = get_collection_groupings(collection_df, args.include_columns, args.grouping_column) + groupings = [] + for column in args.grouping_column: + # If the user specified columns to include, verify the grouping column was included + if args.include_columns and column not in args.include_columns: + print( + f"ERROR: Provided grouping column {column!r} was not in the", + f"list of columns to include: {args.include_columns}.", + file=sys.stderr, + ) + sys.exit(1) + + # Verify the grouping column is included in the collection + if column not in collection_df.columns: + print( + f"ERROR: Provided grouping column {column!r} does not exist in collection TSV.", + file=sys.stderr, + ) + sys.exit(1) + + groupings.append({'key': column}) + if collection_config.get('display_defaults', {}).pop('group_by', None): print( "WARNING: The default group-by in the collection config has been removed", @@ -238,7 +131,19 @@ def export_measurements(args): sys.exit(1) # Combine collection config with command line args - override_config_with_args(collection_config, args) + config_key_args = ['key', 'title', 'filters', 'x_axis_label', 'threshold'] + display_default_args = ['group_by', 'measurements_display', 'show_overall_mean', 'show_threshold'] + + for key_arg in config_key_args: + key_arg_value = getattr(args, key_arg) + if key_arg_value is not None: + collection_config[key_arg] = key_arg_value + + for default_arg in display_default_args: + default_arg_value = getattr(args, default_arg) + if default_arg_value is not None: + collection_config['display_defaults'] = collection_config.get('display_defaults', {}) + collection_config['display_defaults'][default_arg] = default_arg_value # Create collection output object with default values for required keys 
collection_output = { @@ -260,7 +165,14 @@ def export_measurements(args): # Create output JSON write_json(output, args.output_json, include_version=False, **indent) # Verify the produced output is a valid measurements JSON - validate_output_json(args.output_json) + try: + read_measurements_json(measurements_json=args.output_json) + except ValidateError: + print( + "ERROR: Validation of output JSON failed. See detailed errors above.", + file=sys.stderr, + ) + sys.exit(1) def concat_measurements(args): @@ -276,7 +188,14 @@ def concat_measurements(args): indent = {"indent": None} if args.minify_json else {} write_json(output, args.output_json, include_version=False, **indent) - validate_output_json(args.output_json) + try: + read_measurements_json(measurements_json=args.output_json) + except ValidateError: + print( + "ERROR: Validation of output JSON failed. See detailed errors above.", + file=sys.stderr, + ) + sys.exit(1) def register_arguments(parser): diff --git a/tests/functional/measurements_export.t b/tests/functional/measurements_export.t index 1f31f2042..d36b67383 100644 --- a/tests/functional/measurements_export.t +++ b/tests/functional/measurements_export.t @@ -37,7 +37,6 @@ This is expected to fail. > --output-json "$TMP/minimal_measurements.json" ERROR: Cannot use provided 'field_1' column as the strain column because a 'strain' column already exists in collection TSV. ERROR: Cannot use provided 'field_2' column as the value column because a 'value' column already exists in collection TSV. - ERROR: Loading of collection TSV was unsuccessful. See detailed errors above. [1] Try measurements export with user provided strain and value columns that are the same column. @@ -50,7 +49,6 @@ This is expected to fail. > --grouping-column field_1 \ > --output-json "$TMP/minimal_measurements.json" ERROR: The strain column and value column cannot be the same column. - ERROR: Loading of collection TSV was unsuccessful. See detailed errors above. 
[1] Minimal measurements export with user provided strain, value, and subset of columns. @@ -77,7 +75,6 @@ This is expected to fail. > --include-columns field_3 \ > --output-json "$TMP/minimal_measurements_subset.json" 1>/dev/null ERROR: Provided grouping column 'field_1' was not in the list of columns to include: ['field_3']. - ERROR: Cannot create measurements JSON without valid groupings [1] Try measurements export with invalid grouping columns. @@ -88,7 +85,6 @@ This is expected to fail. > --grouping-column bad_field \ > --output-json "$TMP/minimal_measurements.json" ERROR: Provided grouping column 'bad_field' does not exist in collection TSV. - ERROR: Cannot create measurements JSON without valid groupings [1] Measurements export for a single collection using only command line configs.