Merge branch 'master' into victorlin/travis-to-gh

nextstrain · Dec 29, 2021 · 92132b5
2 parents b05ce80 + 6585427
commit 92132b5
Show file tree

Hide file tree

Showing 15 changed files with 426 additions and 101 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -23,7 +23,8 @@ jobs:
         channels: conda-forge,bioconda
         channel-priority: true
         activate-environment: test
-    - run: mamba install mafft raxml fasttree iqtree vcftools pip
+    - run: mamba install mafft raxml fasttree iqtree vcftools pip numpy
+    - run: pip install biopython==1.67
     - run: pip install -e .[dev]
     - run: conda info
     - run: conda list

diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,46 @@
+version: ~> 1.0
+language: generic
+
+# See <https://docs.travis-ci.com/user/build-stages/> for more information on
+# how build stages work.
+stages:
+  - test
+
+  # See <https://docs.travis-ci.com/user/conditions-v1> for more on the "if" syntax.
+  - name: deploy
+    if: branch = release and type != pull_request
+
+jobs:
+  include:
+    - &test
+      stage: test
+      language: python
+      python: 3.6
+      before_install:
+        - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
+        - bash miniconda.sh -b -p $HOME/miniconda
+        - export PATH="$HOME/miniconda/bin:$PATH"
+        - hash -r
+        - conda config --set always_yes yes --set changeps1 no
+        - conda update -q conda
+        - conda info -a
+        - conda create -n augur -c bioconda python=$TRAVIS_PYTHON_VERSION mafft raxml fasttree iqtree vcftools pip numpy
+        - source activate augur
+        - pip install biopython==1.67
+      install:
+        - pip install -e .[dev]
+      script:
+        - (pytest -c pytest.python3.ini  --cov-report= --cov=augur)
+        - (cram --shell=/bin/bash tests/functional/*.t tests/builds/*.t)
+        - (bash tests/builds/runner.sh)
+      after_success:
+        # upload to codecov
+        - bash <(curl -s https://codecov.io/bash) -f "!*.gcov" -X gcov -e TRAVIS_PYTHON_VERSION -y ci/codecov.yml|| echo "Codecov did not collect coverage reports"
+
+    - <<: *test
+      python: 3.7
+    - <<: *test
+      python: 3.8
+
+    - stage: deploy
+      script: ./devel/travis-rebuild-docker-image
diff --git a/CHANGES.md b/CHANGES.md
@@ -3,6 +3,36 @@
 ## __NEXT__
 
 
+## 13.1.0 (10 December 2021)
+
+### Features
+
+* schemas: Add "$id" key to Auspice config schemas so we have a way of referring to these. [#806][] (@tsibley)
+
+### Bug Fixes
+
+* filter: Fix groupby with incomplete dates. [#808][] (@victorlin)
+
+[#806]: https://github.com/nextstrain/augur/pull/806
+[#808]: https://github.com/nextstrain/augur/pull/808
+
+## 13.0.4 (8 December 2021)
+
+### Bug Fixes
+
+* dependencies: Replace deprecated mutable sequence interface for BioPython. [#788][] (@Carlosbogo)
+* dependencies: Fix backward compatibility with BioPython. [#801][] (@huddlej)
+* data: Add latitude and longitude details for "Reunion". [#791][] (@corneliusroemer)
+* filter: Use pandas functions to determine subsample groups. [#794][] and [#797][] (@victorlin)
+* filter: Add clarity to help message and output of probabilistic sampling. [#792][] (@victorlin)
+
+[#788]: https://github.com/nextstrain/augur/pull/788
+[#791]: https://github.com/nextstrain/augur/pull/791
+[#792]: https://github.com/nextstrain/augur/pull/792
+[#794]: https://github.com/nextstrain/augur/pull/794
+[#797]: https://github.com/nextstrain/augur/pull/797
+[#801]: https://github.com/nextstrain/augur/pull/801
+
 ## 13.0.3 (19 November 2021)
 
 ### Bug Fixes

diff --git a/augur/__version__.py b/augur/__version__.py
@@ -1,4 +1,4 @@
-__version__ = '13.0.3'
+__version__ = '13.1.0'
 
 
 def is_augur_version_compatible(version):

diff --git a/augur/data/schema-auspice-config-v2.json b/augur/data/schema-auspice-config-v2.json
@@ -1,7 +1,7 @@
 {
-    "type" : "object",
-    "version": "v2",
     "$schema": "http://json-schema.org/draft-06/schema#",
+    "$id": "https://nextstrain.org/schemas/auspice/config/v2",
+    "type": "object",
     "title": "Auspice config file to be supplied to `augur export v2`",
     "$comment": "This schema includes deprecated-but-handled-by-augur-export-v1 properties, but their schema definitions are somewhat incomplete",
     "additionalProperties": false,

diff --git a/augur/data/schema-export-v1-meta.json b/augur/data/schema-export-v1-meta.json
@@ -1,7 +1,7 @@
 {
-    "type" : "object",
     "$schema": "http://json-schema.org/draft-06/schema#",
-    "version": "0.1",
+    "$id": "https://nextstrain.org/schemas/dataset/v1/meta",
+    "type": "object",
     "title": "Nextstrain minimal metadata JSON schema",
     "description": "This is the validation schema for the augur produced metadata JSON, for consumption in Auspice. Note that every field is optional, but excluding fields may disable certain features in Auspice.",
     "additionalProperties": true,

diff --git a/augur/data/schema-export-v1-tree.json b/augur/data/schema-export-v1-tree.json
@@ -1,6 +1,7 @@
 {
-    "type" : "object",
     "$schema": "http://json-schema.org/draft-06/schema#",
+    "$id": "https://nextstrain.org/schemas/dataset/v1/tree",
+    "type": "object",
     "title": "Nextstrain tree JSON schema",
     "additionalProperties": false,
     "required": ["attr", "strain"],

diff --git a/augur/data/schema-export-v2.json b/augur/data/schema-export-v2.json
@@ -1,7 +1,7 @@
 {
-    "type" : "object",
     "$schema": "http://json-schema.org/draft-06/schema#",
-    "version": "2.0",
+    "$id": "https://nextstrain.org/schemas/dataset/v2",
+    "type": "object",
     "title": "Nextstrain metadata JSON schema proposal (meta + tree together)",
     "additionalProperties": false,
     "required": ["version", "meta", "tree"],

diff --git a/augur/filter.py b/augur/filter.py
@@ -884,91 +884,76 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
     [{'strain': 'strain1', 'filter': 'skip_group_by_with_ambiguous_month', 'kwargs': ''}]
 
     """
-    if group_by:
-        groups = group_by
-    else:
-        groups = ("_dummy",)
-
+    metadata = metadata.loc[strains]
     group_by_strain = {}
     skipped_strains = []
-    for strain in strains:
-        skip_strain = False
-        group = []
-        m = metadata.loc[strain].to_dict()
-        # collect group specifiers
-        for c in groups:
-            if c == "_dummy":
-                group.append(c)
-            elif c in m:
-                group.append(m[c])
-            elif c in ['month', 'year'] and 'date' in m:
-                try:
-                    year = int(m["date"].split('-')[0])
-                except:
+
+    if metadata.empty:
+        return group_by_strain, skipped_strains
+
+    if not group_by or group_by == ('_dummy',):
+        group_by_strain = {strain: ('_dummy',) for strain in strains}
+        return group_by_strain, skipped_strains
+
+    group_by_set = set(group_by)
+
+    # If we could not find any requested categories, we cannot complete subsampling.
+    if 'date' not in metadata and group_by_set <= {'year', 'month'}:
+        raise FilterException(f"The specified group-by categories ({group_by}) were not found. No sequences-per-group sampling will be done. Note that using 'year' or 'year month' requires a column called 'date'.")
+    if not group_by_set & (set(metadata.columns) | {'year', 'month'}):
+        raise FilterException(f"The specified group-by categories ({group_by}) were not found. No sequences-per-group sampling will be done.")
+
+    # grouping by year and/or month requested
+    if 'year' in group_by_set or 'month' in group_by_set:
+        if 'date' not in metadata:
+            # set year/month = unknown
+            print(f"WARNING: A 'date' column could not be found to group-by year or month.", file=sys.stderr)
+            print(f"Filtering by group may behave differently than expected!", file=sys.stderr)
+            df_dates = pd.DataFrame({'year': 'unknown', 'month': 'unknown'}, index=metadata.index)
+            metadata = pd.concat([metadata, df_dates], axis=1)
+        else:
+            # replace date with year/month/day as nullable ints
+            date_cols = ['year', 'month', 'day']
+            df_dates = metadata['date'].str.split('-', n=2, expand=True)
+            df_dates = df_dates.set_axis(date_cols[:len(df_dates.columns)], axis=1)
+            missing_date_cols = set(date_cols) - set(df_dates.columns)
+            for col in missing_date_cols:
+                df_dates[col] = pd.NA
+            for col in date_cols:
+                df_dates[col] = pd.to_numeric(df_dates[col], errors='coerce').astype(pd.Int64Dtype())
+            metadata = pd.concat([metadata.drop('date', axis=1), df_dates], axis=1)
+            if 'year' in group_by_set:
+                # skip ambiguous years
+                df_skip = metadata[metadata['year'].isnull()]
+                metadata.dropna(subset=['year'], inplace=True)
+                for strain in df_skip.index:
                     skipped_strains.append({
                         "strain": strain,
                         "filter": "skip_group_by_with_ambiguous_year",
                         "kwargs": "",
                     })
-                    skip_strain = True
-                    break
-                if c=='month':
-                    try:
-                        month = int(m["date"].split('-')[1])
-                    except:
-                        skipped_strains.append({
-                            "strain": strain,
-                            "filter": "skip_group_by_with_ambiguous_month",
-                            "kwargs": "",
-                        })
-                        skip_strain = True
-                        break
-
-                    group.append((year, month))
-                else:
-                    group.append(year)
-            else:
-                group.append('unknown')
-
-        if not skip_strain:
-            group_by_strain[strain] = tuple(group)
-
-    # If we could not find any requested categories, we cannot complete subsampling.
-    distinct_groups = set(group_by_strain.values())
-    if len(distinct_groups) == 1 and ('unknown' in distinct_groups or ('unknown',) in distinct_groups):
-        error_message = f"The specified group-by categories ({groups}) were not found. No sequences-per-group sampling will be done."
-
-        if any(x in groups for x in ('year', 'month')):
-            error_message += " Note that using 'year' or 'year month' requires a column called 'date'."
-
-        # Raise an exception, since we cannot find the requested groups.
-        raise FilterException(error_message)
-
-    # Check to see if some categories are missing to warn the user
-    group_by = {
-        'date' if cat in ('year', 'month') else cat
-        for cat in groups
-    }
-    missing_cats = [cat for cat in group_by if cat not in metadata.columns.values and cat != "_dummy"]
-    if missing_cats:
-        error_message = []
-
-        if any(cat != 'date' for cat in missing_cats):
-            error_message.append(
-                "Some of the specified group-by categories couldn't be found: %s" % ", ".join([str(cat) for cat in missing_cats if cat != 'date'])
-            )
-
-        if any(cat == 'date' for cat in missing_cats):
-            error_message.append("A 'date' column could not be found to group-by year or month.")
-
-        error_message.append("Filtering by group may behave differently than expected!")
-
-        # Print a warning message, but allow grouping to continue.
-        print(
-            "WARNING: %s" % "\n".join(error_message),
-            file=sys.stderr,
-        )
-
+            if 'month' in group_by_set:
+                # skip ambiguous months
+                df_skip = metadata[metadata['month'].isnull()]
+                metadata.dropna(subset=['month'], inplace=True)
+                for strain in df_skip.index:
+                    skipped_strains.append({
+                        "strain": strain,
+                        "filter": "skip_group_by_with_ambiguous_month",
+                        "kwargs": "",
+                    })
+                # month = (year, month)
+                metadata['month'] = list(zip(metadata['year'], metadata['month']))
+            # TODO: support group by day
+
+    unknown_groups = group_by_set - set(metadata.columns)
+    if unknown_groups:
+        print(f"WARNING: Some of the specified group-by categories couldn't be found: {', '.join(unknown_groups)}", file=sys.stderr)
+        print("Filtering by group may behave differently than expected!", file=sys.stderr)
+        for group in unknown_groups:
+            metadata[group] = 'unknown'
+
+    group_by_strain = dict(zip(metadata.index, metadata[group_by].apply(tuple, axis=1)))
     return group_by_strain, skipped_strains
 
 
@@ -1143,7 +1128,7 @@ def register_arguments(parser):
     subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
     subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")
     probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
-    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
+    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Allow probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
     probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
     subsample_group.add_argument('--priority', type=str, help="""tab-delimited file with list of priority scores for strains (e.g., "<strain>\\t<priority>") and no header.
     When scores are provided, Augur converts scores to floating point values, sorts strains within each subsampling group from highest to lowest priority, and selects the top N strains per group where N is the calculated or requested number of strains per group.
@@ -1494,16 +1479,20 @@ def run(args):
         # sequences requested, sequences per group will be a floating point
         # value and subsampling will be probabilistic.
         try:
-            sequences_per_group = calculate_sequences_per_group(
+            sequences_per_group, probabilistic_used = calculate_sequences_per_group(
                 args.subsample_max_sequences,
                 records_per_group.values(),
                 args.probabilistic_sampling,
             )
-            print(f"Sampling at {sequences_per_group} per group.")
         except TooManyGroupsError as error:
             print(f"ERROR: {error}", file=sys.stderr)
             sys.exit(1)
 
+        if (probabilistic_used):
+            print(f"Sampling probabilistically at {sequences_per_group:0.4f} sequences per group, meaning it is possible to have more than the requested maximum of {args.subsample_max_sequences} sequences after filtering.")
+        else:
+            print(f"Sampling at {sequences_per_group} per group.")
+
         if queues_by_group is None:
             # We know all of the possible groups now from the first pass through
             # the metadata, so we can create queues for all groups at once.
@@ -1711,7 +1700,7 @@ def numeric_date(date):
         return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2))))
 
 
-def calculate_sequences_per_group(target_max_value, counts_per_group, probabilistic=True):
+def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True):
     """Calculate the number of sequences per group for a given maximum number of
     sequences to be returned and the number of sequences in each requested
     group. Optionally, allow the result to be probabilistic such that the mean
@@ -1725,37 +1714,43 @@ def calculate_sequences_per_group(target_max_value, counts_per_group, probabilis
         number of sequences per group for the given counts per group.
     counts_per_group : list[int]
         A list with the number of sequences in each requested group.
-    probabilistic : bool
+    allow_probabilistic : bool
         Whether to allow probabilistic subsampling when the number of groups
         exceeds the requested maximum.
 
     Raises
     ------
     TooManyGroupsError :
         When there are more groups than requested sequences and probabilistic
-        subsampling is not enabled.
+        subsampling is not allowed.
 
     Returns
     -------
     int or float :
         Number of sequences per group.
+    bool :
+        Whether probabilistic subsampling was used.
 
     """
+    probabilistic_used = False
+
     try:
         sequences_per_group = _calculate_sequences_per_group(
             target_max_value,
             counts_per_group,
         )
     except TooManyGroupsError as error:
-        if probabilistic:
+        if allow_probabilistic:
+            print(f"WARNING: {error}")
             sequences_per_group = _calculate_fractional_sequences_per_group(
                 target_max_value,
                 counts_per_group,
             )
+            probabilistic_used = True
         else:
             raise error
 
-    return sequences_per_group
+    return sequences_per_group, probabilistic_used
 
 
 class TooManyGroupsError(ValueError):

diff --git a/augur/refine.py b/augur/refine.py
@@ -242,7 +242,10 @@ def run(args):
             elif args.root in ['least-squares', 'min_dev', 'oldest']:
                 raise TypeError("The rooting option '%s' is only available when inferring a timetree. Please specify an explicit outgroup."%args.root)
             else:
-                T.root_with_outgroup(args.root)
+                try:
+                    T.root_with_outgroup(args.root)
+                except ValueError as err:
+                    raise ValueError(f"HINT: This error may be because your specified root with name '{args.root}' was not found in your alignment file") from err
 
         tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)