Skip to content

Commit

Permalink
Merge pull request #18 from AutoResearch/deal-with-nans
Browse files Browse the repository at this point in the history
bug: make sampler work when NaN values are predicted by the models
  • Loading branch information
younesStrittmatter authored Sep 17, 2023
2 parents 52363f2 + 6f6c55c commit 6dd86dc
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 25 deletions.
60 changes: 48 additions & 12 deletions src/autora/experimentalist/model_disagreement/__init__.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,55 @@
import itertools
from typing import Iterable, List, Union, Optional
import warnings
from typing import Iterable, List, Optional, Union

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from autora.utils.deprecation import deprecated_alias
from sklearn.preprocessing import StandardScaler


def score_sample(conditions: Union[pd.DataFrame, np.ndarray],
models: List,
num_samples: Optional[int] = None):
def score_sample(
conditions: Union[pd.DataFrame, np.ndarray],
models: List,
num_samples: Optional[int] = None,
):
"""
An experimentalist that returns selected samples for independent variables
for which the models disagree the most in terms of their predictions.
Args:
X: pool of IV conditions to evaluate in terms of model disagreement
conditions: pool of IV conditions to evaluate in terms of model disagreement
models: List of Scikit-learn (regression or classification) models to compare
num_samples: number of samples to select
Returns: Sampled pool
Examples:
If a model is undefined at a certain condition, the disagreement on that point is set to 0:
>>> class ModelUndefined:
... def predict(self, X):
... return np.log(X)
>>> class ModelDefinined:
... def predict(self, X):
... return X
>>> modelUndefined = ModelUndefined()
>>> modelDefined = ModelDefinined()
>>> conditions_defined = np.array([1, 2, 3])
>>> score_sample(conditions_defined, [modelUndefined, modelDefined], 3)
0 score
2 3 1.364948
1 2 -0.362023
0 1 -1.002924
>>> conditions_undefined = np.array([-1, 0, 1, 2, 3])
>>> score_sample(conditions_undefined, [modelUndefined, modelDefined], 5)
0 score
4 3 1.752985
3 2 0.330542
2 1 -0.197345
0 -1 -0.943091
1 0 -0.943091
"""

if isinstance(conditions, Iterable) and not isinstance(conditions, pd.DataFrame):
Expand Down Expand Up @@ -61,6 +91,12 @@ def score_sample(conditions: Union[pd.DataFrame, np.ndarray],
else:
disagreement = np.mean((y_a - y_b) ** 2, axis=1)

if np.isinf(disagreement).any() or np.isnan(disagreement).any():
warnings.warn('Found nan or inf values in model predictions, '
'setting disagreement there to 0')
disagreement[np.isinf(disagreement)] = 0
disagreement = np.nan_to_num(disagreement)

model_disagreement.append(disagreement)

assert len(model_disagreement) >= 1, "No disagreements to compare."
Expand All @@ -87,16 +123,15 @@ def score_sample(conditions: Union[pd.DataFrame, np.ndarray],
return conditions.head(num_samples)



def sample(conditions: Union[pd.DataFrame, np.ndarray],
models: List,
num_samples: int = 1):
def sample(
conditions: Union[pd.DataFrame, np.ndarray], models: List, num_samples: int = 1
):
"""
An experimentalist that returns selected samples for independent variables
for which the models disagree the most in terms of their predictions.
Args:
X: pool of IV conditions to evaluate in terms of model disagreement
conditions: pool of IV conditions to evaluate in terms of model disagreement
models: List of Scikit-learn (regression or classification) models to compare
num_samples: number of samples to select
Expand All @@ -112,4 +147,5 @@ def sample(conditions: Union[pd.DataFrame, np.ndarray],
model_disagreement_sample = sample
model_disagreement_score_sample = score_sample
model_disagreement_sampler = deprecated_alias(
model_disagreement_sample, "model_disagreement_sampler")
model_disagreement_sample, "model_disagreement_sampler"
)
38 changes: 25 additions & 13 deletions tests/test_model_disagreement_sampler.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,43 @@
from src.autora.experimentalist.model_disagreement import model_disagreement_sample, model_disagreement_score_sample
from autora.theorist.bms import BMSRegressor; BMSRegressor()
from autora.theorist.darts import DARTSRegressor; DARTSRegressor()
import numpy as np
import pandas as pd

from autora.experimentalist.model_disagreement import (
model_disagreement_sample,
model_disagreement_score_sample,
)
from autora.theorist.bms import BMSRegressor
from autora.theorist.darts import DARTSRegressor

BMSRegressor()


DARTSRegressor()


def test_output_dimensions():
#Meta-Setup
# Meta-Setup
X = np.linspace(start=-3, stop=6, num=10).reshape(-1, 1)
y = (X**2).reshape(-1, 1)
n = 5
#Theorists

# Theorists
bms_theorist = BMSRegressor(epochs=10)
darts_theorist = DARTSRegressor(max_epochs=10)

bms_theorist.fit(X,y)
darts_theorist.fit(X,y)

#Sampler
bms_theorist.fit(X, y)
darts_theorist.fit(X, y)

# Sampler
X_new = model_disagreement_sample(X, [bms_theorist, darts_theorist], n)

# Check that the sampler returns n experiment conditions
assert X_new.shape == (n, X.shape[1])


def test_pandas():
# Meta-Setup
X = np.linspace(start=-3, stop=6, num=10).reshape(-1, 1)
y = (X ** 2).reshape(-1, 1)
y = (X**2).reshape(-1, 1)
n = 5

X = pd.DataFrame(X)
Expand All @@ -45,10 +56,11 @@ def test_pandas():
assert isinstance(X_new, pd.DataFrame)
assert X_new.shape == (n, X.shape[1])


def test_scoring():
# Meta-Setup
X = np.linspace(start=-3, stop=6, num=10).reshape(-1, 1)
y = (X ** 2).reshape(-1, 1)
y = (X**2).reshape(-1, 1)
n = 5

X = pd.DataFrame(X)
Expand All @@ -66,4 +78,4 @@ def test_scoring():
# Check that the sampler returns n experiment conditions
assert isinstance(X_new, pd.DataFrame)
assert "score" in X_new.columns
assert X_new.shape == (n, X.shape[1] + 1)
assert X_new.shape == (n, X.shape[1] + 1)

0 comments on commit 6dd86dc

Please sign in to comment.