Skip to content

Commit

Permalink
Merge branch 'master' into victorlin/filter/priority-speedup
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin committed Dec 11, 2021
2 parents 4e3e155 + a3a79ca commit 9ac13ea
Show file tree
Hide file tree
Showing 11 changed files with 80 additions and 10 deletions.
13 changes: 13 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@
## __NEXT__


## 13.1.0 (10 December 2021)

### Features

* schemas: Add "$id" key to Auspice config schemas so we have a way of referring to these. [#806][] (@tsibley)

### Bug Fixes

* filter: Fix groupby with incomplete dates. [#808][] (@victorlin)

[#806]: https://github.com/nextstrain/augur/pull/806
[#808]: https://github.com/nextstrain/augur/pull/808

## 13.0.4 (8 December 2021)

### Bug Fixes
Expand Down
2 changes: 1 addition & 1 deletion augur/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '13.0.4'
__version__ = '13.1.0'


def is_augur_version_compatible(version):
Expand Down
4 changes: 2 additions & 2 deletions augur/data/schema-auspice-config-v2.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"type" : "object",
"version": "v2",
"$schema": "http://json-schema.org/draft-06/schema#",
"$id": "https://nextstrain.org/schemas/auspice/config/v2",
"type": "object",
"title": "Auspice config file to be supplied to `augur export v2`",
"$comment": "This schema includes deprecated-but-handled-by-augur-export-v1 properties, but their schema definitions are somewhat incomplete",
"additionalProperties": false,
Expand Down
4 changes: 2 additions & 2 deletions augur/data/schema-export-v1-meta.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"type" : "object",
"$schema": "http://json-schema.org/draft-06/schema#",
"version": "0.1",
"$id": "https://nextstrain.org/schemas/dataset/v1/meta",
"type": "object",
"title": "Nextstrain minimal metadata JSON schema",
"description": "This is the validation schema for the augur produced metadata JSON, for consumption in Auspice. Note that every field is optional, but excluding fields may disable certain features in Auspice.",
"additionalProperties": true,
Expand Down
3 changes: 2 additions & 1 deletion augur/data/schema-export-v1-tree.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"type" : "object",
"$schema": "http://json-schema.org/draft-06/schema#",
"$id": "https://nextstrain.org/schemas/dataset/v1/tree",
"type": "object",
"title": "Nextstrain tree JSON schema",
"additionalProperties": false,
"required": ["attr", "strain"],
Expand Down
4 changes: 2 additions & 2 deletions augur/data/schema-export-v2.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"type" : "object",
"$schema": "http://json-schema.org/draft-06/schema#",
"version": "2.0",
"$id": "https://nextstrain.org/schemas/dataset/v2",
"type": "object",
"title": "Nextstrain metadata JSON schema proposal (meta + tree together)",
"additionalProperties": false,
"required": ["version", "meta", "tree"],
Expand Down
7 changes: 5 additions & 2 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,8 +957,11 @@ def expand_date_col(metadata: pd.DataFrame, group_by_set: set) -> Tuple[pd.DataF
skipped_strains = []
# replace date with year/month/day as nullable ints
date_cols = ['year', 'month', 'day']
df_dates = (metadata_new['date'].str.split('-', n=2, expand=True)
.set_axis(date_cols, axis=1))
df_dates = metadata['date'].str.split('-', n=2, expand=True)
df_dates = df_dates.set_axis(date_cols[:len(df_dates.columns)], axis=1)
missing_date_cols = set(date_cols) - set(df_dates.columns)
for col in missing_date_cols:
df_dates[col] = pd.NA
for col in date_cols:
df_dates[col] = pd.to_numeric(df_dates[col], errors='coerce').astype(pd.Int64Dtype())
metadata_new = pd.concat([metadata_new.drop('date', axis=1), df_dates], axis=1)
Expand Down
Empty file added filtered_strains.txt
Empty file.
45 changes: 45 additions & 0 deletions tests/test_filter_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,48 @@ def test_filter_groupby_no_strains(self, valid_metadata: pd.DataFrame):
groups, group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, group_by=groups)
assert group_by_strain == {}
assert skipped_strains == []

def test_filter_groupby_only_year_provided(self, valid_metadata: pd.DataFrame):
    """Group by country and year when every date is the bare year '2020'.

    Each strain is expected to land in a (country, 2020) group and no
    strain is expected to be skipped.
    """
    year_only_metadata = valid_metadata.copy()
    year_only_metadata['date'] = '2020'
    all_strains = year_only_metadata.index.tolist()

    result = get_groups_for_subsampling(
        all_strains,
        year_only_metadata,
        group_by=['country', 'year'],
    )
    _, group_by_strain, skipped_strains = result

    # Country expected for each strain in the fixture.
    country_by_strain = {
        'SEQ_1': 'A',
        'SEQ_2': 'A',
        'SEQ_3': 'B',
        'SEQ_4': 'B',
        'SEQ_5': 'B',
    }
    expected_groups = {
        strain: (country, 2020)
        for strain, country in country_by_strain.items()
    }
    assert group_by_strain == expected_groups
    assert skipped_strains == []

def test_filter_groupby_month_with_only_year_provided(self, valid_metadata: pd.DataFrame):
    """Group by country, year and month when every date is the bare year '2020'.

    No strain is expected to be grouped; each one is expected to be
    reported as skipped with the ambiguous-month filter name.
    """
    year_only_metadata = valid_metadata.copy()
    year_only_metadata['date'] = '2020'
    all_strains = year_only_metadata.index.tolist()

    result = get_groups_for_subsampling(
        all_strains,
        year_only_metadata,
        group_by=['country', 'year', 'month'],
    )
    _, group_by_strain, skipped_strains = result

    # One skip record per strain, in fixture order.
    expected_skipped = [
        {'strain': strain, 'filter': 'skip_group_by_with_ambiguous_month', 'kwargs': ''}
        for strain in ('SEQ_1', 'SEQ_2', 'SEQ_3', 'SEQ_4', 'SEQ_5')
    ]
    assert group_by_strain == {}
    assert skipped_strains == expected_skipped

def test_filter_groupby_only_year_month_provided(self, valid_metadata: pd.DataFrame):
    """Group by country, year and month when every date is '2020-01'.

    Each strain is expected to land in a (country, 2020, (2020, 1)) group
    and no strain is expected to be skipped.
    """
    year_month_metadata = valid_metadata.copy()
    year_month_metadata['date'] = '2020-01'
    all_strains = year_month_metadata.index.tolist()

    result = get_groups_for_subsampling(
        all_strains,
        year_month_metadata,
        group_by=['country', 'year', 'month'],
    )
    _, group_by_strain, skipped_strains = result

    # Country expected for each strain in the fixture.
    country_by_strain = {
        'SEQ_1': 'A',
        'SEQ_2': 'A',
        'SEQ_3': 'B',
        'SEQ_4': 'B',
        'SEQ_5': 'B',
    }
    expected_groups = {
        strain: (country, 2020, (2020, 1))
        for strain, country in country_by_strain.items()
    }
    assert group_by_strain == expected_groups
    assert skipped_strains == []
4 changes: 4 additions & 0 deletions tmp/filtered_strains-1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
PRVABC59
COL/FLR_00008/2015
ZKC2/2016
VEN/UF_1/2016
4 changes: 4 additions & 0 deletions tmp/filtered_strains.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
PRVABC59
ZKC2/2016
VEN/UF_1/2016
BRA/2016/FC_6706

0 comments on commit 9ac13ea

Please sign in to comment.