From dace5cf971fff65e74a36522cbfa2455f44a9087 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 6 May 2024 09:19:48 -0500 Subject: [PATCH] FIX Raise error when missing-values encountered in scikit-tree trees (#264) * Implement smoke test --------- Signed-off-by: Adam Li --- .github/workflows/main.yml | 1 + build_requirements.txt | 8 ++++---- doc/whats_new/v0.8.rst | 6 ++++++ pyproject.toml | 4 ++-- sktree/tree/_neighbors.py | 4 ++++ sktree/tree/tests/test_all_trees.py | 23 ++++++++++++++++++++++- 6 files changed, 39 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index adb093b0e..52305e64c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -61,6 +61,7 @@ jobs: run: | brew install ccache brew install gcc + brew install gettext - name: show-gcc run: | diff --git a/build_requirements.txt b/build_requirements.txt index c343ceee8..13c52c519 100644 --- a/build_requirements.txt +++ b/build_requirements.txt @@ -1,9 +1,9 @@ -meson -meson-python -cython>=3.0.8 +meson>=1.4.0 +meson-python>=0.16.0 +cython>=3.0.10 ninja numpy -scikit-learn>=1.4.1 +scikit-learn>=1.4.2 click rich-click doit diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst index 0a7ba4f58..d2ffd5776 100644 --- a/doc/whats_new/v0.8.rst +++ b/doc/whats_new/v0.8.rst @@ -13,6 +13,12 @@ Version 0.8 Changelog --------- +- |Fix| Previously missing-values in ``X`` input array for sktree estimators + did not raise an error, and silently ran, assuming the missing-values were + encoded as infinity value. This is now fixed, and the estimators will raise an + ValueError if missing-values are encountered in ``X`` input array. + By `Adam Li`_ (:pr:`#264`) + Code and Documentation Contributors ----------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 0eae820d6..afab274b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] build-backend = "mesonpy" requires = [ - "meson-python>=0.15.0", + "meson-python>=0.16.0", 'ninja', # `wheel` is needed for non-isolated builds, given that `meson-python` # doesn't list it as a runtime requirement (at least in 0.10.0) @@ -9,7 +9,7 @@ requires = [ "wheel", "setuptools<=65.5", "packaging", - "Cython>=3.0.8", + "Cython>=3.0.10", "scikit-learn>=1.4.1", "scipy>=1.5.0", "numpy>=1.25; python_version>='3.9'" diff --git a/sktree/tree/_neighbors.py b/sktree/tree/_neighbors.py index 93d8ff1a0..7b02be7ce 100644 --- a/sktree/tree/_neighbors.py +++ b/sktree/tree/_neighbors.py @@ -64,3 +64,7 @@ def compute_similarity_matrix(self, X): The similarity matrix among the samples. """ return compute_forest_similarity_matrix(self, X) + + def _more_tags(self): + # XXX: no scikit-tree estimators support NaNs as of now + return {"allow_nan": False} diff --git a/sktree/tree/tests/test_all_trees.py b/sktree/tree/tests/test_all_trees.py index 66a9ea307..ab0e4fcf3 100644 --- a/sktree/tree/tests/test_all_trees.py +++ b/sktree/tree/tests/test_all_trees.py @@ -3,7 +3,7 @@ import pytest from numpy.testing import assert_almost_equal, assert_array_equal from sklearn.base import is_classifier -from sklearn.datasets import make_blobs +from sklearn.datasets import load_iris, make_blobs from sklearn.tree._tree import TREE_LEAF from sktree.tree import ( @@ -162,3 +162,24 @@ def test_similarity_matrix(tree): assert np.allclose(sim_mat, sim_mat.T) assert np.all((sim_mat.diagonal() == 1)) + + +@pytest.mark.parametrize("tree", ALL_TREES) +def test_missing_values(tree): + """Smoke test to ensure that correct error is raised when missing values are present. + + xref: https://github.com/neurodata/scikit-tree/issues/263 + """ + rng = np.random.default_rng(123) + + iris_X, iris_y = load_iris(return_X_y=True, as_frame=True) + + # Make the feature matrix 25% sparse + iris_X = iris_X.mask(rng.standard_normal(iris_X.shape) < 0.25) + + classifier = tree() + with pytest.raises(ValueError, match="Input X contains NaN"): + if tree.__name__.startswith("Unsupervised"): + classifier.fit(iris_X) + else: + classifier.fit(iris_X, iris_y)