added bottleneck for nan calculations (#306)
* added bottleneck for nan calculations
* added bottleneck to test requirements
ryanhausen committed Aug 2, 2024
1 parent 867bc33 commit e8c7de5
Showing 7 changed files with 91 additions and 9 deletions.
1 change: 1 addition & 0 deletions doc/whats_new/_contributors.rst
@@ -27,3 +27,4 @@
.. _Ronan Perry : https://rflperry.github.io/
.. _Haoyin Xu : https://github.com/PSSF23
.. _Yuxin Bai : https://github.com/YuxinB
.. _Ryan Hausen : https://ryanhausen.github.io
4 changes: 4 additions & 0 deletions doc/whats_new/v0.10.rst
@@ -13,6 +13,9 @@ Version 0.10
Changelog
---------

- |Feature| Calculations involving nans in ``treeple.stats.utils`` now use the
``bottleneck`` library for faster computation. By `Ryan Hausen`_ (:pr:`#306`)


Code and Documentation Contributors
-----------------------------------
@@ -21,3 +24,4 @@ Thanks to everyone who has contributed to the maintenance and improvement of
the project since version inception, including:

* `Adam Li`_
* `Ryan Hausen`_
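
The changelog entry above summarizes the whole change: NaN-aware reductions in treeple.stats.utils now prefer bottleneck when it is available. As a rough illustration of the motivation (not part of the PR, and no timings are claimed), the two backends can be compared directly, assuming bottleneck is installed:

import timeit

import bottleneck as bn
import numpy as np

# A (n_trees, n_samples, n_classes)-shaped stack with NaNs, similar in spirit
# to the posterior arrays reduced in treeple.stats.utils.
rng = np.random.default_rng(0)
arr = rng.random((100, 10_000, 2))
arr[arr < 0.3] = np.nan

print("np.nanmean:", timeit.timeit(lambda: np.nanmean(arr, axis=0), number=10))
print("bn.nanmean:", timeit.timeit(lambda: bn.nanmean(arr, axis=0), number=10))
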
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -60,7 +60,8 @@ all = [
'treeple[build]',
'treeple[doc]',
'treeple[style]',
'treeple[test]'
'treeple[test]',
'treeple[extra]'
]
build = [
'build',
@@ -123,6 +124,9 @@ test = [
'flaky',
'tqdm'
]
extra = [
'bottleneck'
]

[tool.bandit]
exclude_dirs = ["treeple/tests", "treeple/**/tests/*", 'treeple/_build_utils/*', 'treeple/_lib/*']
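
The new extra group keeps bottleneck optional rather than a hard dependency, and the all group now pulls it in as well. A small sanity check after installing, for example with pip install "treeple[extra]" (assuming an install source that carries this extras group):

import importlib.util

# If bottleneck is missing entirely, treeple.stats.utils falls back to numpy and warns.
if importlib.util.find_spec("bottleneck") is None:
    print("bottleneck is not installed; the numpy fallback will be used")
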
3 changes: 2 additions & 1 deletion test_requirements.txt
@@ -4,4 +4,5 @@ pytest
pytest-cov
memory_profiler
flaky
tqdm
tqdm
bottleneck
23 changes: 19 additions & 4 deletions treeple/stats/tests/test_forestht.py
@@ -1,9 +1,14 @@
import importlib
import os

import numpy as np
import pytest
from flaky import flaky
from numpy.testing import assert_almost_equal, assert_array_equal
from sklearn import datasets

import treeple.stats as stats
import treeple.stats.utils as utils
from treeple import HonestForestClassifier, RandomForestClassifier
from treeple.stats import (
PermutationHonestForestClassifier,
@@ -236,11 +241,21 @@ def test_comight_repeated_feature_sets(seed):
assert result.pvalue > 0.05, f"{result.pvalue}"


def test_build_coleman_forest():
@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_build_coleman_forest(use_bottleneck: bool):
"""Simple test for building a Coleman forest.
Test the function under alternative and null hypothesis for a very simple dataset.
"""
if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
del os.environ[utils.DISABLE_BN_ENV_VAR]
importlib.reload(utils)
importlib.reload(stats)
else:
os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
importlib.reload(utils)
importlib.reload(stats)

n_estimators = 100
n_samples = 30
n_features = 5
@@ -273,10 +288,10 @@ def test_build_coleman_forest():
with pytest.raises(
RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier"
):
build_coleman_forest(clf, clf, X, y)
stats.build_coleman_forest(clf, clf, X, y)

forest_result, orig_forest_proba, perm_forest_proba, clf_fitted, perm_clf_fitted = (
build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed)
stats.build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed)
)
assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
@@ -287,7 +302,7 @@ def test_build_coleman_forest():
assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape)

X = np.vstack([_X, _X])
forest_result, _, _, clf_fitted, perm_clf_fitted = build_coleman_forest(
forest_result, _, _, clf_fitted, perm_clf_fitted = stats.build_coleman_forest(
clf, perm_clf, X, y, metric="s@98"
)
assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}"
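
The parametrization above exercises both code paths by toggling the TREEPLE_NO_BOTTLENECK environment variable and reloading the modules, so the module-level backend selection in treeple.stats.utils is re-evaluated. The same pattern can force the numpy fallback outside the test suite; a minimal sketch following the test code:

import importlib
import os

import treeple.stats as stats
import treeple.stats.utils as utils

os.environ[utils.DISABLE_BN_ENV_VAR] = "1"  # i.e. TREEPLE_NO_BOTTLENECK
importlib.reload(utils)  # re-run the module-level backend check
importlib.reload(stats)  # let the stats package pick up the reloaded utils

print(utils.nanmean_f)   # now np.nanmean, even if bottleneck is installed

Setting the variable before the first import of treeple.stats has the same effect without the reloads.
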
36 changes: 36 additions & 0 deletions treeple/stats/tests/test_utils.py
@@ -1,7 +1,11 @@
import importlib
import os

import numpy as np
import pytest
from numpy.testing import assert_array_equal

import treeple.stats.utils as utils
from treeple import HonestForestClassifier
from treeple.stats.utils import get_per_tree_oob_samples

@@ -32,3 +36,35 @@ def test_get_per_tree_oob_samples(bootstrap):
else:
with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"):
get_per_tree_oob_samples(est)


@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_non_nan_samples(use_bottleneck: bool):

if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
del os.environ[utils.DISABLE_BN_ENV_VAR]
importlib.reload(utils)
else:
os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
importlib.reload(utils)

posterior_array = np.array(
[
# tree 1
[
[0, 1],
[np.nan, np.nan],
[np.nan, np.nan],
],
# tree 2
[
[0, 1],
[np.nan, np.nan],
[1, 0],
],
]
) # [2, 3, 2]

expected = np.array([0, 2])
actual = utils._non_nan_samples(posterior_array)
np.testing.assert_array_equal(expected, actual)
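
The expected result [0, 2] follows from the masking logic in _non_nan_samples: a sample index is excluded only when every tree produced a NaN posterior for it. A standalone numpy sketch of that logic on the array from the test:

import numpy as np

posterior = np.array(
    [
        [[0, 1], [np.nan, np.nan], [np.nan, np.nan]],  # tree 1
        [[0, 1], [np.nan, np.nan], [1, 0]],            # tree 2
    ]
)  # shape (n_trees, n_samples, n_classes) = (2, 3, 2)

nan_per_tree = np.isnan(posterior).any(axis=2)  # (2, 3): NaN anywhere across classes
all_trees_nan = nan_per_tree.all(axis=0)        # (3,): True only for sample index 1
print(np.where(~all_trees_nan)[0])              # [0 2]
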
27 changes: 24 additions & 3 deletions treeple/stats/utils.py
@@ -1,3 +1,6 @@
import os
import sys
import warnings
from typing import Optional, Tuple

import numpy as np
@@ -16,6 +19,24 @@

from treeple._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier

BOTTLENECK_AVAILABLE = False
if "bottleneck" in sys.modules:
import bottleneck as bn

BOTTLENECK_AVAILABLE = True

DISABLE_BN_ENV_VAR = "TREEPLE_NO_BOTTLENECK"

if BOTTLENECK_AVAILABLE and DISABLE_BN_ENV_VAR not in os.environ:
nanmean_f = bn.nanmean
anynan_f = lambda arr: bn.anynan(arr, axis=2)
else:
warnings.warn(
"Not using bottleneck for calculations involvings nans. Expect slower performance."
)
nanmean_f = np.nanmean
anynan_f = lambda arr: np.isnan(arr).any(axis=2)


def _mutual_information(y_true: ArrayLike, y_pred_proba: ArrayLike) -> float:
"""Compute estimate of mutual information for supervised classification setting.
@@ -131,7 +152,7 @@ def _non_nan_samples(posterior_arr: ArrayLike) -> ArrayLike:
along axis=1.
"""
# Find the row indices with NaN values along the specified axis
nan_indices = np.isnan(posterior_arr).any(axis=2).all(axis=0)
nan_indices = anynan_f(posterior_arr).all(axis=0)

# Invert the boolean mask to get indices without NaN values
nonnan_indices = np.where(~nan_indices)[0]
@@ -320,8 +341,8 @@ def _parallel_build_null_forests(
# first_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_first_half)
# second_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_second_half)

y_pred_first_half = np.nanmean(first_forest_pred[:, first_forest_samples, :], axis=0)
y_pred_second_half = np.nanmean(second_forest_pred[:, second_forest_samples, :], axis=0)
y_pred_first_half = nanmean_f(first_forest_pred[:, first_forest_samples, :], axis=0)
y_pred_second_half = nanmean_f(second_forest_pred[:, second_forest_samples, :], axis=0)

# compute two instances of the metric from the sampled trees
first_half_metric = metric_func(
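
The bottleneck functions are used here as drop-in replacements: bn.nanmean takes the same (array, axis) arguments as np.nanmean, and bn.anynan(arr, axis=2) mirrors np.isnan(arr).any(axis=2). A quick equivalence check, assuming bottleneck is installed (not part of the PR):

import bottleneck as bn
import numpy as np

rng = np.random.default_rng(0)
arr = rng.random((4, 10, 3))
arr[arr < 0.1] = np.nan  # sprinkle some NaNs

# assert_allclose treats NaN == NaN by default, so NaN entries also compare equal.
np.testing.assert_allclose(bn.nanmean(arr, axis=0), np.nanmean(arr, axis=0))
np.testing.assert_array_equal(bn.anynan(arr, axis=2), np.isnan(arr).any(axis=2))
print("bottleneck and numpy agree on this sample")
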
