added bottleneck for nan calculations (#306)
* added bottleneck for nan calculations
* added bottleneck to test requirements
ryanhausen committed Aug 2, 2024
1 parent 867bc33 commit e8c7de5
Showing 7 changed files with 91 additions and 9 deletions.
1 change: 1 addition & 0 deletions doc/whats_new/_contributors.rst
@@ -27,3 +27,4 @@
.. _Ronan Perry : https://rflperry.github.io/
.. _Haoyin Xu : https://github.com/PSSF23
.. _Yuxin Bai : https://github.com/YuxinB
.. _Ryan Hausen : https://ryanhausen.github.io
4 changes: 4 additions & 0 deletions doc/whats_new/v0.10.rst
@@ -13,6 +13,9 @@ Version 0.10
Changelog
---------

- |Feature| Calculations involving nans in ``treeple.stats.utils`` now use the
``bottleneck`` library for faster computation. By `Ryan Hausen`_ (:pr:`#306`)


Code and Documentation Contributors
-----------------------------------
@@ -21,3 +24,4 @@ Thanks to everyone who has contributed to the maintenance and improvement of
the project since version inception, including:

* `Adam Li`_
* `Ryan Hausen`_
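
The changelog entry above summarizes the whole change: NaN-aware reductions in treeple.stats.utils now prefer bottleneck when it is available. As a rough illustration of the motivation (not part of the PR, and no timings are claimed), the two backends can be compared directly, assuming bottleneck is installed:

import timeit

import bottleneck as bn
import numpy as np

# A (n_trees, n_samples, n_classes)-shaped stack with NaNs, similar in spirit
# to the posterior arrays reduced in treeple.stats.utils.
rng = np.random.default_rng(0)
arr = rng.random((100, 10_000, 2))
arr[arr < 0.3] = np.nan

print("np.nanmean:", timeit.timeit(lambda: np.nanmean(arr, axis=0), number=10))
print("bn.nanmean:", timeit.timeit(lambda: bn.nanmean(arr, axis=0), number=10))
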
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -60,7 +60,8 @@ all = [
'treeple[build]',
'treeple[doc]',
'treeple[style]',
'treeple[test]'
'treeple[test]',
'treeple[extra]'
]
build = [
'build',
@@ -123,6 +124,9 @@ test = [
'flaky',
'tqdm'
]
extra = [
'bottleneck'
]

[tool.bandit]
exclude_dirs = ["treeple/tests", "treeple/**/tests/*", 'treeple/_build_utils/*', 'treeple/_lib/*']
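
The new extra group keeps bottleneck optional rather than a hard dependency, and the all group now pulls it in as well. A small sanity check after installing, for example with pip install "treeple[extra]" (assuming an install source that carries this extras group):

import importlib.util

# If bottleneck is missing entirely, treeple.stats.utils falls back to numpy and warns.
if importlib.util.find_spec("bottleneck") is None:
    print("bottleneck is not installed; the numpy fallback will be used")
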
3 changes: 2 additions & 1 deletion test_requirements.txt
@@ -4,4 +4,5 @@ pytest
pytest-cov
memory_profiler
flaky
tqdm
tqdm
bottleneck
23 changes: 19 additions & 4 deletions treeple/stats/tests/test_forestht.py
@@ -1,9 +1,14 @@
import importlib
import os

import numpy as np
import pytest
from flaky import flaky
from numpy.testing import assert_almost_equal, assert_array_equal
from sklearn import datasets

import treeple.stats as stats
import treeple.stats.utils as utils
from treeple import HonestForestClassifier, RandomForestClassifier
from treeple.stats import (
PermutationHonestForestClassifier,
@@ -236,11 +241,21 @@ def test_comight_repeated_feature_sets(seed):
assert result.pvalue > 0.05, f"{result.pvalue}"


def test_build_coleman_forest():
@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_build_coleman_forest(use_bottleneck: bool):
"""Simple test for building a Coleman forest.
Test the function under alternative and null hypothesis for a very simple dataset.
"""
if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
del os.environ[utils.DISABLE_BN_ENV_VAR]
importlib.reload(utils)
importlib.reload(stats)
else:
os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
importlib.reload(utils)
importlib.reload(stats)

n_estimators = 100
n_samples = 30
n_features = 5
@@ -273,10 +288,10 @@ def test_build_coleman_forest():
with pytest.raises(
RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier"
):
build_coleman_forest(clf, clf, X, y)
stats.build_coleman_forest(clf, clf, X, y)

forest_result, orig_forest_proba, perm_forest_proba, clf_fitted, perm_clf_fitted = (
build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed)
stats.build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed)
)
assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
@@ -287,7 +302,7 @@ def test_build_coleman_forest():
assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape)

X = np.vstack([_X, _X])
forest_result, _, _, clf_fitted, perm_clf_fitted = build_coleman_forest(
forest_result, _, _, clf_fitted, perm_clf_fitted = stats.build_coleman_forest(
clf, perm_clf, X, y, metric="s@98"
)
assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}"
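
The parametrization above exercises both code paths by toggling the TREEPLE_NO_BOTTLENECK environment variable and reloading the modules, so the module-level backend selection in treeple.stats.utils is re-evaluated. The same pattern can force the numpy fallback outside the test suite; a minimal sketch following the test code:

import importlib
import os

import treeple.stats as stats
import treeple.stats.utils as utils

os.environ[utils.DISABLE_BN_ENV_VAR] = "1"  # i.e. TREEPLE_NO_BOTTLENECK
importlib.reload(utils)  # re-run the module-level backend check
importlib.reload(stats)  # let the stats package pick up the reloaded utils

print(utils.nanmean_f)   # now np.nanmean, even if bottleneck is installed

Setting the variable before the first import of treeple.stats has the same effect without the reloads.
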
36 changes: 36 additions & 0 deletions treeple/stats/tests/test_utils.py
@@ -1,7 +1,11 @@
import importlib
import os

import numpy as np
import pytest
from numpy.testing import assert_array_equal

import treeple.stats.utils as utils
from treeple import HonestForestClassifier
from treeple.stats.utils import get_per_tree_oob_samples

@@ -32,3 +36,35 @@ def test_get_per_tree_oob_samples(bootstrap):
else:
with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"):
get_per_tree_oob_samples(est)


@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_non_nan_samples(use_bottleneck: bool):

if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
del os.environ[utils.DISABLE_BN_ENV_VAR]
importlib.reload(utils)
else:
os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
importlib.reload(utils)

posterior_array = np.array(
[
# tree 1
[
[0, 1],
[np.nan, np.nan],
[np.nan, np.nan],
],
# tree 2
[
[0, 1],
[np.nan, np.nan],
[1, 0],
],
]
) # [2, 3, 2]

expected = np.array([0, 2])
actual = utils._non_nan_samples(posterior_array)
np.testing.assert_array_equal(expected, actual)
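
The expected result [0, 2] follows from the masking logic in _non_nan_samples: a sample index is excluded only when every tree produced a NaN posterior for it. A standalone numpy sketch of that logic on the array from the test:

import numpy as np

posterior = np.array(
    [
        [[0, 1], [np.nan, np.nan], [np.nan, np.nan]],  # tree 1
        [[0, 1], [np.nan, np.nan], [1, 0]],            # tree 2
    ]
)  # shape (n_trees, n_samples, n_classes) = (2, 3, 2)

nan_per_tree = np.isnan(posterior).any(axis=2)  # (2, 3): NaN anywhere across classes
all_trees_nan = nan_per_tree.all(axis=0)        # (3,): True only for sample index 1
print(np.where(~all_trees_nan)[0])              # [0 2]
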
27 changes: 24 additions & 3 deletions treeple/stats/utils.py
@@ -1,3 +1,6 @@
import os
import sys
import warnings
from typing import Optional, Tuple

import numpy as np
@@ -16,6 +19,24 @@

from treeple._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier

BOTTLENECK_AVAILABLE = False
if "bottleneck" in sys.modules:
import bottleneck as bn

BOTTLENECK_AVAILABLE = True

DISABLE_BN_ENV_VAR = "TREEPLE_NO_BOTTLENECK"

if BOTTLENECK_AVAILABLE and DISABLE_BN_ENV_VAR not in os.environ:
nanmean_f = bn.nanmean
anynan_f = lambda arr: bn.anynan(arr, axis=2)
else:
warnings.warn(
"Not using bottleneck for calculations involvings nans. Expect slower performance."
)
nanmean_f = np.nanmean
anynan_f = lambda arr: np.isnan(arr).any(axis=2)


def _mutual_information(y_true: ArrayLike, y_pred_proba: ArrayLike) -> float:
"""Compute estimate of mutual information for supervised classification setting.
@@ -131,7 +152,7 @@ def _non_nan_samples(posterior_arr: ArrayLike) -> ArrayLike:
along axis=1.
"""
# Find the row indices with NaN values along the specified axis
nan_indices = np.isnan(posterior_arr).any(axis=2).all(axis=0)
nan_indices = anynan_f(posterior_arr).all(axis=0)

# Invert the boolean mask to get indices without NaN values
nonnan_indices = np.where(~nan_indices)[0]
@@ -320,8 +341,8 @@ def _parallel_build_null_forests(
# first_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_first_half)
# second_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_second_half)

y_pred_first_half = np.nanmean(first_forest_pred[:, first_forest_samples, :], axis=0)
y_pred_second_half = np.nanmean(second_forest_pred[:, second_forest_samples, :], axis=0)
y_pred_first_half = nanmean_f(first_forest_pred[:, first_forest_samples, :], axis=0)
y_pred_second_half = nanmean_f(second_forest_pred[:, second_forest_samples, :], axis=0)

# compute two instances of the metric from the sampled trees
first_half_metric = metric_func(
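
The bottleneck functions are used here as drop-in replacements: bn.nanmean takes the same (array, axis) arguments as np.nanmean, and bn.anynan(arr, axis=2) mirrors np.isnan(arr).any(axis=2). A quick equivalence check, assuming bottleneck is installed (not part of the PR):

import bottleneck as bn
import numpy as np

rng = np.random.default_rng(0)
arr = rng.random((4, 10, 3))
arr[arr < 0.1] = np.nan  # sprinkle some NaNs

# assert_allclose treats NaN == NaN by default, so NaN entries also compare equal.
np.testing.assert_allclose(bn.nanmean(arr, axis=0), np.nanmean(arr, axis=0))
np.testing.assert_array_equal(bn.anynan(arr, axis=2), np.isnan(arr).any(axis=2))
print("bottleneck and numpy agree on this sample")
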
