From f82e2d2a767e2f41d6d5235b0ee31e7fbdc33b8b Mon Sep 17 00:00:00 2001 From: radlfabs Date: Fri, 10 Nov 2023 19:12:53 +0100 Subject: [PATCH 1/6] allow method types: our enum|cv generator|iterator update method names --- flexcv/split.py | 203 ++++++++++++++---------------------------------- 1 file changed, 60 insertions(+), 143 deletions(-) diff --git a/flexcv/split.py b/flexcv/split.py index c0a7a34..f342782 100644 --- a/flexcv/split.py +++ b/flexcv/split.py @@ -9,17 +9,21 @@ from typing import Callable, Iterator import pandas as pd -import numpy as np from numpy import ndarray -from numpy.core._exceptions import UFuncTypeError from sklearn.model_selection import ( BaseCrossValidator, + GroupsConsumerMixin, GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold, ) -from sklearn.preprocessing import KBinsDiscretizer + +from .stratification import ( + ContinuousStratifiedKFold, + ContinuousStratifiedGroupKFold, + ConcatenatedStratifiedKFold +) class CrossValMethod(Enum): @@ -28,20 +32,21 @@ class CrossValMethod(Enum): Members: - `KFOLD`: Regular sklearn `KFold` cross validation. No grouping information is used. - - `CUSTOMSTRAT`: Applies stratification on the target variable using a custom discretization of the target variable. - I.e. uses the sklearn `StratifiedKFold` cross validation but for a continuous target variable instead of a multi-class target variable. - - `GROUP`: Applies grouping information on the samples. I.e. uses the sklearn `GroupKFold` cross validation. - - `STRATGROUP`: Uses the sklearn `StratifiedGroupKFold` cross validation. - - `CUSTOMSTRATGROUP`: Applies stratification to both the target variable and the grouping information. - I.e. uses the sklearn `StratifiedGroupKFold` cross validation but for a continuous target variable instead of a multi-class target variable. - + - `GROUP`: Regular sklearn `GroupKFold` cross validation. Grouping information is used. + - `STRAT`: Regular sklearn `StratifiedKFold` cross validation. 
No grouping information is used. + - `STRATGROUP`: Regular sklearn `StratifiedGroupKFold` cross validation. Grouping information is used. + - `CONTISTRAT`: Stratified cross validation for continuous targets. No grouping information is used. + - `CONTISTRATGROUP`: Stratified cross validation for continuous targets. Grouping information is used. + - `CONCATSTRATKFOLD`: Stratified cross validation. Leaky stratification on element-wise-concatenated target and group labels. """ KFOLD = "KFold" GROUP = "GroupKFold" - CUSTOMSTRAT = "CustomStratifiedKFold" + STRAT = "StratifiedKFold" STRATGROUP = "StratifiedGroupKFold" - CUSTOMSTRATGROUP = "CustomStratifiedGroupKFold" + CONTISTRAT = "ContinuousStratifiedKFold" + CONTISTRATGROUP = "ContinuousStratifiedGroupKFold" + CONCATSTRATKFOLD = "ConcatenatedStratifiedKFold" def string_to_crossvalmethod(method: str) -> CrossValMethod: @@ -67,113 +72,6 @@ def string_to_crossvalmethod(method: str) -> CrossValMethod: raise ValueError("Invalid Cross Validation method given.") -class CustomStratifiedGroupKFold(BaseCrossValidator): - """sklearn's StratifiedGroupKFold adapted for continuous target variables.""" - - def __init__(self, n_splits, shuffle=True, random_state=42, groups=None): - self.n_splits = n_splits - self.shuffle = shuffle - self.random_state = random_state - self.groups = groups - - def split(self, X, y, groups=None): - """Generate indices to split data into training and test set. - The data is first grouped by groups and then split into n_splits folds. The folds are made by preserving the percentage of samples for each class. - This is a variation of StratifiedGroupKFold that uses a custom discretization of the target variable. - - Args: - X (array-like): Features - y (array-like): target - groups (array-like): Grouping/clustering variable (Default value = None) - - Returns: - (Iterator[tuple[ndarray, ndarray]]): Iterator over the indices of the training and test set. 
- """ - self.sgkf = StratifiedGroupKFold( - n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state - ) - assert y is not None, "y cannot be None" - kbins = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile") - if isinstance(y, pd.Series): - y_cat = ( - kbins.fit_transform(y.to_numpy().reshape(-1, 1)).flatten().astype(int) - ) - y_cat = pd.Series(y_cat, index=y.index) - else: - y_cat = kbins.fit_transform(y.reshape(-1, 1)).flatten().astype(int) # type: ignore - return self.sgkf.split(X, y_cat, groups) - - def get_n_splits(self, X, y=None, groups=None): - """ - Returns the number of splitting iterations in the cross-validator. - - Returns: - (int): The number of splitting iterations in the cross-validator. - """ - return self.n_splits - - -class CustomStratifiedKFold(BaseCrossValidator): - """Cross Validation Method. - This is a variation of StratifiedKFold that uses a custom discretization of the target variable. - Stratification is done on the concatination of discretized target variable and group instead of the original target variable. - This ensures, that distributions of the target variable per group are similar in each fold. - """ - def __init__(self, n_splits, shuffle=True, random_state=42, groups=None): - self.n_splits = n_splits - self.shuffle = shuffle - self.random_state = random_state - self.groups = groups - - def split(self, X, y, groups=None): - """Generate indices to split data into training and test set. - The data is first grouped by groups and then split into n_splits folds. The folds are made by preserving the percentage of samples for each class. - This is a variation of StratifiedGroupKFold that uses a custom discretization of the target variable. - - Args: - X (array-like): Features - y (array-like): target - groups (array-like): Grouping variable (Default value = None) - - Returns: - (Iterator[tuple[ndarray, ndarray]]): Iterator over the indices of the training and test set. 
- """ - self.skf = StratifiedKFold( - n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state - ) - assert y is not None, "y cannot be None" - kbins = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile") - if isinstance(y, pd.Series): - y_cat = ( - kbins.fit_transform(y.to_numpy().reshape(-1, 1)).flatten().astype(int) - ) - y_cat = pd.Series(y_cat, index=y.index) - else: - y_cat = kbins.fit_transform(y.reshape(-1, 1)).flatten().astype(int) # type: ignore - # concatenate y_cat and groups such that the stratification is done on both - # elementwise concatenation of three arrays - try: - y_cat = y_cat.astype(str) + "_" + groups.astype(str) - except UFuncTypeError: - # Why easy when you can do it the hard way? - y_concat = np.core.defchararray.add(np.core.defchararray.add(y_cat.astype(str), "_"), groups.astype(str)) - - return self.skf.split(X, y_concat) - - def get_n_splits(self, X, y=None, groups=None): - """ - - Args: - X (array-like): Features - y (array-like): target values. (Default value = None) - groups (array-like): grouping values. (Default value = None) - - Returns: - (int) : The number of splitting iterations in the cross-validator. 
- """ - return self.n_splits - - def make_cross_val_split( *, groups: pd.Series | None, @@ -196,45 +94,64 @@ def make_cross_val_split( (TypeError): If the given method is not one of KFOLD """ - + match method: case CrossValMethod.KFOLD: - cross_val_obj = KFold( + kf = KFold( n_splits=n_splits, random_state=random_state, shuffle=True ) - return cross_val_obj.split - + return kf.split + + case CrossValMethod.STRAT: + strat_skf = StratifiedKFold( + n_splits=n_splits, random_state=random_state, shuffle=True + ) + return strat_skf.split + + case CrossValMethod.CONTISTRAT: + conti_skf = ContinuousStratifiedKFold( + n_splits=n_splits, random_state=random_state, shuffle=True + ) + return conti_skf.split + case CrossValMethod.GROUP: - if groups is None: - raise ValueError("Groups must be specified for GroupKFold.") - cross_val_obj = GroupKFold(n_splits=n_splits) - return partial(cross_val_obj.split, groups=groups) - + gkf = GroupKFold(n_splits=n_splits) + return partial(gkf.split, groups=groups) + case CrossValMethod.STRATGROUP: - if groups is None: - raise ValueError("Groups must be specified for StratGroupKFold.") - cross_val_obj = StratifiedGroupKFold( + strat_gkf = StratifiedGroupKFold( n_splits=n_splits, random_state=random_state, shuffle=True ) - return partial(cross_val_obj.split, groups=groups) + return partial(strat_gkf.split, groups=groups) - case CrossValMethod.CUSTOMSTRATGROUP: - if groups is None: - raise ValueError("Groups must be specified for CustomStratGroupKFold.") - cross_val_obj = CustomStratifiedGroupKFold( + case CrossValMethod.CONTISTRATGROUP: + conti_sgkf = ContinuousStratifiedGroupKFold( n_splits=n_splits, random_state=random_state, shuffle=True ) - return partial(cross_val_obj.split, groups=groups) + return partial(conti_sgkf.split, groups=groups) - case CrossValMethod.CUSTOMSTRAT: - if groups is None: - raise ValueError("Groups must be specified for our StratifiedKFold.") - cross_val_obj = CustomStratifiedKFold( + case 
CrossValMethod.CONCATSTRATKFOLD: + concat_skf = ConcatenatedStratifiedKFold( n_splits=n_splits, random_state=random_state, shuffle=True ) - return partial(cross_val_obj.split, groups=groups) + return partial(concat_skf.split, groups=groups) + case _: - raise TypeError("Invalid Cross Validation method given.") + + is_cross_validator = isinstance(method, BaseCrossValidator) + is_groups_consumer = isinstance(method, GroupsConsumerMixin) + + if is_cross_validator and is_groups_consumer: + return partial(method.split, groups=groups) + + if is_cross_validator: + return method.split + + if isinstance(method, Iterator): + return method + + else: + raise TypeError("Invalid Cross Validation method given.") if __name__ == "__main__": From 51d14453d85694105c6214de1bf1ed1c8f13a6ea Mon Sep 17 00:00:00 2001 From: radlfabs Date: Fri, 10 Nov 2023 19:14:02 +0100 Subject: [PATCH 2/6] move custom stratifications to new module --- flexcv/stratification.py | 188 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 flexcv/stratification.py diff --git a/flexcv/stratification.py b/flexcv/stratification.py new file mode 100644 index 0000000..d9280aa --- /dev/null +++ b/flexcv/stratification.py @@ -0,0 +1,188 @@ +"""This module implements two stratification methods that can be used in contexts of regression on hierarchical data (i.e. where the target is continuous and the data is grouped). +""" +import pandas as pd +import numpy as np + +from numpy.core._exceptions import UFuncTypeError +from sklearn.model_selection import ( + BaseCrossValidator, + GroupsConsumerMixin, + StratifiedGroupKFold, + StratifiedKFold, +) +from sklearn.preprocessing import KBinsDiscretizer + + +class ContinuousStratifiedKFold(BaseCrossValidator): + """Continuous Stratified k-Folds cross validator, i.e. it works with *continuous* target variables instead of multiclass targets. 
+ + This is a variation of StratifiedKFold that + + - makes a copy of the target variable and discretizes it. + - applies stratified k-folds based on this discrete target to ensure equal percentile distribution across folds + - does not further use or pass this discrete target. + - does not apply grouping rules. + """ + def __init__(self, n_splits, shuffle=True, random_state=42, groups=None): + self.n_splits = n_splits + self.shuffle = shuffle + self.random_state = random_state + self.groups = groups + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + The folds are made by preserving the percentage of samples for each class. + This is a variation of StratifiedKFold that uses a custom discretization of the target variable. + + Args: + X (array-like): Features + y (array-like): target + groups (array-like): Grouping variable (Default value = None) + + Returns: + (Iterator[tuple[ndarray, ndarray]]): Iterator over the indices of the training and test set. + """ + self.skf = StratifiedKFold( + n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state + ) + assert y is not None, "y cannot be None" + kbins = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile") + if isinstance(y, pd.Series): + y_cat = ( + kbins.fit_transform(y.to_numpy().reshape(-1, 1)).flatten().astype(int) + ) + y_cat = pd.Series(y_cat, index=y.index) + else: + y_cat = kbins.fit_transform(y.reshape(-1, 1)).flatten().astype(int) # type: ignore + + return self.skf.split(X, y_cat) + + def get_n_splits(self, X, y=None, groups=None): + """ + + Args: + X (array-like): Features + y (array-like): target values. (Default value = None) + groups (array-like): grouping values. (Default value = None) + + Returns: + (int) : The number of splitting iterations in the cross-validator. 
+ """ + return self.n_splits + + +class ContinuousStratifiedGroupKFold(GroupsConsumerMixin, BaseCrossValidator): + """Continuous Stratified Group k-Folds cross validator. + This is a variation of StratifiedGroupKFold that + - makes a temporary discretization of the target variable. + - applies stratified group k-fold based on the passed groups and the discretized target. + - does not further use this discretized target + - tries to preserve the percentage of samples in each percentile per group given the constraint of non-overlapping groups + """ + + def __init__(self, n_splits, shuffle=True, random_state=42, groups=None): + self.n_splits = n_splits + self.shuffle = shuffle + self.random_state = random_state + self.groups = groups + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + The data is first grouped by groups and then split into n_splits folds. The folds are made by preserving the percentage of samples for each class. + This is a variation of StratifiedGroupKFold that uses a custom discretization of the target variable. + + Args: + X (array-like): Features + y (array-like): target + groups (array-like): Grouping/clustering variable (Default value = None) + + Returns: + (Iterator[tuple[ndarray, ndarray]]): Iterator over the indices of the training and test set. 
+ """ + self.sgkf = StratifiedGroupKFold( + n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state + ) + assert y is not None, "y cannot be None" + kbins = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile") + if isinstance(y, pd.Series): + y_cat = ( + kbins.fit_transform(y.to_numpy().reshape(-1, 1)).flatten().astype(int) + ) + y_cat = pd.Series(y_cat, index=y.index) + else: + y_cat = kbins.fit_transform(y.reshape(-1, 1)).flatten().astype(int) # type: ignore + return self.sgkf.split(X, y_cat, groups) + + def get_n_splits(self, X, y=None, groups=None): + """ + Returns the number of splitting iterations in the cross-validator. + + Returns: + (int): The number of splitting iterations in the cross-validator. + """ + return self.n_splits + + +class ConcatenatedStratifiedKFold(GroupsConsumerMixin, BaseCrossValidator): + """Group Concatenated Continuous Stratified k-Folds cross validator. + This is a variation of StratifiedKFold that uses a concatenation of target and grouping variable. + + - The target is discretized. + - Each discrete target label is cast to `str` and concatenated with the grouping label + - Stratification is applied to this new temporary concatenated target + - This preserves the group's *and* the target's distribution in each fold to be roughly equal to the input distribution + - The procedure allows overlapping groups which could be interpreted as data leakage in many cases. + - Population (i.e. the input data set) distribution is leaking into the folds' distribution. + """ + def __init__(self, n_splits, shuffle=True, random_state=42, groups=None): + self.n_splits = n_splits + self.shuffle = shuffle + self.random_state = random_state + self.groups = groups + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + Applies target discretization, row-wise concatenation with the group-label, and stratification on this temporary concatenated column. 
+ + Args: + X (array-like): Features + y (array-like): target + groups (array-like): Grouping variable (Default value = None) + + Returns: + (Iterator[tuple[ndarray, ndarray]]): Iterator over the indices of the training and test set. + """ + self.skf = StratifiedKFold( + n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state + ) + assert y is not None, "y cannot be None" + kbins = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile") + if isinstance(y, pd.Series): + y_cat = ( + kbins.fit_transform(y.to_numpy().reshape(-1, 1)).flatten().astype(int) + ) + y_cat = pd.Series(y_cat, index=y.index) + else: + y_cat = kbins.fit_transform(y.reshape(-1, 1)).flatten().astype(int) # type: ignore + # concatenate y_cat and groups such that the stratification is done on both + # elementwise concatenation of three arrays + try: + y_concat = y_cat.astype(str) + "_" + groups.astype(str) + except UFuncTypeError: + # Why easy when you can do it the hard way? + y_concat = np.core.defchararray.add(np.core.defchararray.add(y_cat.astype(str), "_"), groups.astype(str)) + + return self.skf.split(X, y_concat) + + def get_n_splits(self, X, y=None, groups=None): + """ + + Args: + X (array-like): Features + y (array-like): target values. (Default value = None) + groups (array-like): grouping values. (Default value = None) + + Returns: + (int) : The number of splitting iterations in the cross-validator. 
+ """ + return self.n_splits From d543ee1aafc3c89410ca4938e9caa48d171ec4b4 Mon Sep 17 00:00:00 2001 From: radlfabs Date: Fri, 10 Nov 2023 20:24:10 +0100 Subject: [PATCH 3/6] move check for mapped n_trials to interface --- flexcv/core.py | 7 +------ flexcv/interface.py | 9 +++++++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/flexcv/core.py b/flexcv/core.py index 7623c1c..75df96e 100644 --- a/flexcv/core.py +++ b/flexcv/core.py @@ -303,7 +303,7 @@ def cross_validate( else: # this block performs the inner cross-validation with Optuna - n_trials + n_trials = mapping[model_name]["n_trials"] n_jobs_cv_int = mapping[model_name]["n_jobs_cv"] pipe_in = Pipeline( @@ -328,11 +328,6 @@ def cross_validate( study_update_freq=10, # log every 10th trial, ) - if n_trials == "mapped": # TODO can this be automatically detected by the CrossValidation Class? - n_trials = mapping[model_name]["n_trials"] - if not isinstance(n_trials, int): - raise ValueError("Invalid value for n_trials.") - # generate numpy random_state object for seeding the sampler random_state = check_random_state(random_seed) sampler_seed = random_state.randint(0, np.iinfo("int32").max) diff --git a/flexcv/interface.py b/flexcv/interface.py index 04a9ae7..e0775b4 100644 --- a/flexcv/interface.py +++ b/flexcv/interface.py @@ -441,6 +441,15 @@ def _prepare_before_perform(self): if not hasattr(self.config, "run"): self.config["run"] = DummyRun() + + # check for every key in config, if "n_trials" is set + # if not, set to the value of self.config["n_trials"] + for model_key, inner_dict in self.config["mapping"].items(): + if "n_trials" not in inner_dict: + self.config["mapping"][model_key]["n_trials"] = self.config["n_trials"] + + elif not isinstance(inner_dict["n_trials"], int): + raise TypeError("n_trials must be an integer") def _log(self): """Logs the config to Neptune. If None, a Dummy is instantiated. 
From 69002d450e31ef3b041631b24a9f12202eddad89 Mon Sep 17 00:00:00 2001 From: radlfabs Date: Fri, 10 Nov 2023 21:09:34 +0100 Subject: [PATCH 4/6] fix bug where get_n_splits raises with required X --- flexcv/stratification.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/flexcv/stratification.py b/flexcv/stratification.py index d9280aa..a2976aa 100644 --- a/flexcv/stratification.py +++ b/flexcv/stratification.py @@ -4,9 +4,11 @@ import numpy as np from numpy.core._exceptions import UFuncTypeError -from sklearn.model_selection import ( +from sklearn.model_selection._split import ( BaseCrossValidator, GroupsConsumerMixin, +) +from sklearn.model_selection import ( StratifiedGroupKFold, StratifiedKFold, ) @@ -57,7 +59,7 @@ def split(self, X, y, groups=None): return self.skf.split(X, y_cat) - def get_n_splits(self, X, y=None, groups=None): + def get_n_splits(self, X=None, y=None, groups=None): """ Args: @@ -113,7 +115,7 @@ def split(self, X, y, groups=None): y_cat = kbins.fit_transform(y.reshape(-1, 1)).flatten().astype(int) # type: ignore return self.sgkf.split(X, y_cat, groups) - def get_n_splits(self, X, y=None, groups=None): + def get_n_splits(self, X=None, y=None, groups=None): """ Returns the number of splitting iterations in the cross-validator. 
@@ -174,7 +176,7 @@ def split(self, X, y, groups=None): return self.skf.split(X, y_concat) - def get_n_splits(self, X, y=None, groups=None): + def get_n_splits(self, X=None, y=None, groups=None): """ Args: From aab1592db2d6d9f8f0a6d87984d3c077a8911e9e Mon Sep 17 00:00:00 2001 From: radlfabs Date: Fri, 10 Nov 2023 21:18:33 +0100 Subject: [PATCH 5/6] add new method types to func signature fix bug in import error fix bug where method of type kfold raises AttributeError .value --- flexcv/core.py | 9 +++++---- flexcv/interface.py | 18 +++++++++++++----- flexcv/split.py | 4 +++- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/flexcv/core.py b/flexcv/core.py index 75df96e..0f0b876 100644 --- a/flexcv/core.py +++ b/flexcv/core.py @@ -1,6 +1,6 @@ import logging import warnings -from typing import Dict +from typing import Dict, Iterator import numpy as np import optuna @@ -11,6 +11,7 @@ from sklearn.preprocessing import StandardScaler from sklearn.utils.validation import check_random_state from statsmodels.tools.sm_exceptions import ConvergenceWarning +from sklearn.model_selection._split import BaseCrossValidator from tqdm import tqdm from .cv_logging import ( @@ -111,8 +112,8 @@ def cross_validate( run: NeptuneRun, groups: pd.Series, slopes: pd.DataFrame | pd.Series, - split_out: CrossValMethod, - split_in: CrossValMethod, + split_out: CrossValMethod | BaseCrossValidator | Iterator, + split_in: CrossValMethod | BaseCrossValidator | Iterator, break_cross_val: bool, scale_in: bool, scale_out: bool, @@ -140,7 +141,7 @@ def cross_validate( run (NeptuneRun): A Run object to log to. groups (pd.Series): The grouping or clustering variable. slopes (pd.DataFrame | pd.Series): Random slopes variable(s) - split_out (CrossValMethod): Outer split strategy. + split_out (CrossValMethod | BaseCrossValidator | Iterator): Outer split strategy. split_in (CrossValMethod): Inner split strategy. break_cross_val (bool): If True, only the first outer fold is evaluated. 
scale_in (bool): If True, the features are scaled in the inner cross-validation to zero mean and unit variance. This works independently of the outer scaling. diff --git a/flexcv/interface.py b/flexcv/interface.py index e0775b4..c3c98ee 100644 --- a/flexcv/interface.py +++ b/flexcv/interface.py @@ -5,10 +5,13 @@ import logging from dataclasses import dataclass from pprint import pformat +from typing import Iterator import pandas as pd +import numpy as np from neptune.metadata_containers.run import Run as NeptuneRun from neptune.types import File +from sklearn.model_selection import BaseCrossValidator from .core import cross_validate from .metrics import MetricsDict @@ -199,8 +202,8 @@ def set_data( def set_splits( self, - split_out: str | CrossValMethod = CrossValMethod.KFOLD, - split_in: str | CrossValMethod = CrossValMethod.KFOLD, + split_out: str | CrossValMethod | BaseCrossValidator | Iterator = CrossValMethod.KFOLD, + split_in: str | CrossValMethod | BaseCrossValidator | Iterator = CrossValMethod.KFOLD, n_splits_out: int = 5, n_splits_in: int = 5, scale_out: bool = True, @@ -477,9 +480,14 @@ def _log(self): run["data/slopes_name"].log( pd.DataFrame(self.config["slopes"]).columns.tolist() ) - - run["cross_val/cross_val_method_out"].log(self.config["split_out"].value) - run["cross_val/cross_val_method_in"].log(self.config["split_in"].value) + try: + run["cross_val/cross_val_method_out"].log(self.config["split_out"].value) + except AttributeError: + run["cross_val/cross_val_method_out"].log(self.config["split_out"]) + try: + run["cross_val/cross_val_method_in"].log(self.config["split_in"].value) + except AttributeError: + run["cross_val/cross_val_method_in"].log(self.config["split_in"]) run["cross_val/n_splits_out"].log(self.config["n_splits_out"]) run["cross_val/n_splits_in"].log(self.config["n_splits_in"]) run["cross_val/scale_in"].log(self.config["scale_in"]) diff --git a/flexcv/split.py b/flexcv/split.py index f342782..246e4ef 100644 --- a/flexcv/split.py +++ 
b/flexcv/split.py @@ -10,9 +10,11 @@ import pandas as pd from numpy import ndarray -from sklearn.model_selection import ( +from sklearn.model_selection._split import ( BaseCrossValidator, GroupsConsumerMixin, + ) +from sklearn.model_selection import ( GroupKFold, KFold, StratifiedGroupKFold, From d8ff7ca984dd9fa54f439205186ee22264abab5c Mon Sep 17 00:00:00 2001 From: radlfabs Date: Fri, 10 Nov 2023 21:18:46 +0100 Subject: [PATCH 6/6] add tests --- test/test_interface.py | 80 ++++++++++++++++++- ...est_split_concatenated_stratified_group.py | 42 ++++++++++ test/test_split_continuous_stratified.py | 40 ++++++++++ .../test_split_continuous_stratified_group.py | 42 ++++++++++ test/test_splits_custom_strat.py | 30 ------- test/test_splits_custom_strat_group.py | 30 ------- test/test_splits_make_cross_val_split.py | 64 +++++++++++++-- test/test_z_full_interface_fixed_effects.py | 35 +++++++- 8 files changed, 291 insertions(+), 72 deletions(-) create mode 100644 test/test_split_concatenated_stratified_group.py create mode 100644 test/test_split_continuous_stratified.py create mode 100644 test/test_split_continuous_stratified_group.py delete mode 100644 test/test_splits_custom_strat.py delete mode 100644 test/test_splits_custom_strat_group.py diff --git a/test/test_interface.py b/test/test_interface.py index c49d556..fd0ee9f 100644 --- a/test/test_interface.py +++ b/test/test_interface.py @@ -4,8 +4,8 @@ from unittest.mock import MagicMock from sklearn.ensemble import RandomForestRegressor -import pandas as pd - +from sklearn.model_selection import KFold +import numpy as np from flexcv.interface import CrossValidationResults, CrossValidation from flexcv.split import CrossValMethod @@ -157,6 +157,42 @@ def test_cross_validation_set_splits_invalid_metrics(): cv = CrossValidation() with pytest.raises(TypeError): cv.set_splits(metrics="invalid type") + +def test_set_splits_with_cross_val_method(): + cv = CrossValidation() + cv.set_splits(split_out=CrossValMethod.KFOLD, 
split_in=CrossValMethod.KFOLD) + assert cv.config["split_out"] == CrossValMethod.KFOLD + assert cv.config["split_in"] == CrossValMethod.KFOLD + +def test_set_splits_with_string(): + cv = CrossValidation() + cv.set_splits(split_out="KFold", split_in="KFold") + assert cv.config["split_out"] == CrossValMethod.KFOLD + assert cv.config["split_in"] == CrossValMethod.KFOLD + +def test_set_splits_with_sklearn_cross_validator(): + cv = CrossValidation() + kfold = KFold(n_splits=5) + cv.set_splits(split_out=kfold, split_in=kfold) + assert cv.config["split_out"] == kfold + assert cv.config["split_in"] == kfold + +def test_set_splits_with_iterator(): + cv = CrossValidation() + iterator = iter([np.array([1, 2, 3]), np.array([4, 5, 6])]) + cv.set_splits(split_out=iterator, split_in=iterator) + assert cv.config["split_out"] == iterator + assert cv.config["split_in"] == iterator + +def test_set_splits_with_invalid_string(): + cv = CrossValidation() + with pytest.raises(TypeError): + cv.set_splits(split_out="InvalidMethod", split_in="InvalidMethod") + +def test_set_splits_with_invalid_type(): + cv = CrossValidation() + with pytest.raises(TypeError): + cv.set_splits(split_out=123, split_in=123) def test_cross_validation_set_models_valid(): # Test set_models method with valid mapping @@ -373,3 +409,43 @@ def test_cross_validation_results_not_performed(): cv = CrossValidation() with pytest.raises(RuntimeError): cv.results + +def test_prepare_before_perform(): + # Test _prepare_before_perform method + cv = CrossValidation() + cv.config["split_out"] = "kfold" + cv.config["split_in"] = "group" + cv.config["mapping"] = ModelMappingDict({ + "RandomForestRegressor": ModelConfigDict({ + "model": RandomForestRegressor, + "parameters": {"n_estimators": 100} + }) + }) + cv.config["n_trials"] = 100 + cv._prepare_before_perform() + assert cv.config["split_out"] == CrossValMethod.KFOLD + assert cv.config["split_in"] == CrossValMethod.GROUP + assert isinstance(cv.config["run"], DummyRun) + assert 
cv.config["mapping"]["RandomForestRegressor"]["n_trials"] == 100 + +def test_prepare_before_perform_with_run(): + # Test _prepare_before_perform method with a run already set + cv = CrossValidation() + cv.config["run"] = NeptuneRun() + cv._prepare_before_perform() + assert isinstance(cv.config["run"], NeptuneRun) + +def test_prepare_before_perform_with_n_trials(): + # Test _prepare_before_perform method with n_trials already set in mapping + cv = CrossValidation() + cv.config["mapping"] = ModelMappingDict({ + "RandomForestRegressor": ModelConfigDict({ + "model": RandomForestRegressor, + "parameters": {"n_estimators": 100}, + "n_trials": 50 + }) + }) + cv.config["n_trials"] = 100 + cv._prepare_before_perform() + assert cv.config["mapping"]["RandomForestRegressor"]["n_trials"] == 50 + diff --git a/test/test_split_concatenated_stratified_group.py b/test/test_split_concatenated_stratified_group.py new file mode 100644 index 0000000..dfdea22 --- /dev/null +++ b/test/test_split_concatenated_stratified_group.py @@ -0,0 +1,42 @@ +from flexcv.stratification import ConcatenatedStratifiedKFold +from sklearn.datasets import make_regression +import numpy as np +import pandas as pd + +def test_concatenated_stratified_kfold_init(): + # Test initialization + cv = ConcatenatedStratifiedKFold(n_splits=5, shuffle=True, random_state=42) + assert isinstance(cv, ConcatenatedStratifiedKFold) + assert cv.n_splits == 5 + assert cv.shuffle == True + assert cv.random_state == 42 + +def test_concatenated_stratified_kfold_split(): + # Test split method + X, y = make_regression(n_samples=100, n_features=20, random_state=42) + groups = np.random.choice([1, 2, 3], size=100) + cv = ConcatenatedStratifiedKFold(n_splits=5, shuffle=True, random_state=42) + splits = list(cv.split(X, y, groups)) + # Check that the correct number of splits are returned + assert len(splits) == 5 + # Check that the training and test sets are disjoint + for train_index, test_index in splits: + assert len(set(train_index) & 
set(test_index)) == 0 + +def test_concatenated_stratified_kfold_split_with_series(): + # Test split method with y as a Series + X, y = make_regression(n_samples=100, n_features=20, random_state=42) + y = pd.Series(y) + groups = np.random.choice([1, 2, 3], size=100) + cv = ConcatenatedStratifiedKFold(n_splits=5, shuffle=True, random_state=42) + splits = list(cv.split(X, y, groups)) + # Check that the correct number of splits are returned + assert len(splits) == 5 + # Check that the training and test sets are disjoint + for train_index, test_index in splits: + assert len(set(train_index) & set(test_index)) == 0 + +def test_concatenated_stratified_kfold_get_n_splits(): + # Test get_n_splits method + cv = ConcatenatedStratifiedKFold(n_splits=5, shuffle=True, random_state=42) + assert cv.get_n_splits() == 5 \ No newline at end of file diff --git a/test/test_split_continuous_stratified.py b/test/test_split_continuous_stratified.py new file mode 100644 index 0000000..504b1c3 --- /dev/null +++ b/test/test_split_continuous_stratified.py @@ -0,0 +1,40 @@ +from flexcv.stratification import ContinuousStratifiedKFold +from sklearn.datasets import make_regression +import numpy as np +import pandas as pd + +def test_continuous_stratified_kfold_init(): + # Test initialization + cv = ContinuousStratifiedKFold(n_splits=5, shuffle=True, random_state=42) + assert isinstance(cv, ContinuousStratifiedKFold) + assert cv.n_splits == 5 + assert cv.shuffle == True + assert cv.random_state == 42 + +def test_continuous_stratified_kfold_split(): + # Test split method + X, y = make_regression(n_samples=100, n_features=20, random_state=42) + cv = ContinuousStratifiedKFold(n_splits=5, shuffle=True, random_state=42) + splits = list(cv.split(X, y)) + # Check that the correct number of splits are returned + assert len(splits) == 5 + # Check that the training and test sets are disjoint + for train_index, test_index in splits: + assert len(set(train_index) & set(test_index)) == 0 + +def 
test_continuous_stratified_kfold_split_with_series(): + # Test split method with y as a Series + X, y = make_regression(n_samples=100, n_features=20, random_state=42) + y = pd.Series(y) + cv = ContinuousStratifiedKFold(n_splits=5, shuffle=True, random_state=42) + splits = list(cv.split(X, y)) + # Check that the correct number of splits are returned + assert len(splits) == 5 + # Check that the training and test sets are disjoint + for train_index, test_index in splits: + assert len(set(train_index) & set(test_index)) == 0 + +def test_continuous_stratified_kfold_get_n_splits(): + # Test get_n_splits method + cv = ContinuousStratifiedKFold(n_splits=5, shuffle=True, random_state=42) + assert cv.get_n_splits() == 5 \ No newline at end of file diff --git a/test/test_split_continuous_stratified_group.py b/test/test_split_continuous_stratified_group.py new file mode 100644 index 0000000..e421125 --- /dev/null +++ b/test/test_split_continuous_stratified_group.py @@ -0,0 +1,42 @@ +from flexcv.stratification import ContinuousStratifiedGroupKFold +from sklearn.datasets import make_regression +import numpy as np +import pandas as pd + +def test_continuous_stratified_group_kfold_init(): + # Test initialization + cv = ContinuousStratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42) + assert isinstance(cv, ContinuousStratifiedGroupKFold) + assert cv.n_splits == 5 + assert cv.shuffle == True + assert cv.random_state == 42 + +def test_continuous_stratified_group_kfold_split(): + # Test split method + X, y = make_regression(n_samples=100, n_features=20, random_state=42) + groups = np.random.choice([1, 2, 3], size=100) + cv = ContinuousStratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42) + splits = list(cv.split(X, y, groups)) + # Check that the correct number of splits are returned + assert len(splits) == 5 + # Check that the training and test sets are disjoint + for train_index, test_index in splits: + assert len(set(train_index) & set(test_index)) == 0 + +def 
test_continuous_stratified_group_kfold_split_with_series(): + # Test split method with y as a Series + X, y = make_regression(n_samples=100, n_features=20, random_state=42) + y = pd.Series(y) + groups = np.random.choice([1, 2, 3], size=100) + cv = ContinuousStratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42) + splits = list(cv.split(X, y, groups)) + # Check that the correct number of splits are returned + assert len(splits) == 5 + # Check that the training and test sets are disjoint + for train_index, test_index in splits: + assert len(set(train_index) & set(test_index)) == 0 + +def test_continuous_stratified_group_kfold_get_n_splits(): + # Test get_n_splits method + cv = ContinuousStratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42) + assert cv.get_n_splits() == 5 \ No newline at end of file diff --git a/test/test_splits_custom_strat.py b/test/test_splits_custom_strat.py deleted file mode 100644 index d1a29c9..0000000 --- a/test/test_splits_custom_strat.py +++ /dev/null @@ -1,30 +0,0 @@ -from flexcv.split import CustomStratifiedKFold -from sklearn.datasets import make_classification -import numpy as np -import pandas as pd - -def test_custom_stratified_kfold_init(): - # Test initialization - cv = CustomStratifiedKFold(n_splits=5, shuffle=True, random_state=42) - assert isinstance(cv, CustomStratifiedKFold) - assert cv.n_splits == 5 - assert cv.shuffle == True - assert cv.random_state == 42 - -def test_custom_stratified_kfold_split(): - # Test split method - X, y = make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=10, random_state=42) - groups = np.random.choice([1, 2, 3], size=100) - cv = CustomStratifiedKFold(n_splits=5, shuffle=True, random_state=42) - splits = list(cv.split(X, y, groups)) - # Check that the correct number of splits are returned - assert len(splits) == 5 - # Check that the training and test sets are disjoint - for train_index, test_index in splits: - assert len(set(train_index) & 
set(test_index)) == 0 - -def test_custom_stratified_kfold_get_n_splits(): - # Test get_n_splits method - cv = CustomStratifiedKFold(n_splits=5, shuffle=True, random_state=42) - X = np.random.rand(100, 20) - assert cv.get_n_splits(X) == 5 \ No newline at end of file diff --git a/test/test_splits_custom_strat_group.py b/test/test_splits_custom_strat_group.py deleted file mode 100644 index 3463023..0000000 --- a/test/test_splits_custom_strat_group.py +++ /dev/null @@ -1,30 +0,0 @@ -from flexcv.split import CustomStratifiedGroupKFold -from sklearn.datasets import make_classification -import numpy as np -import pandas as pd - -def test_custom_stratified_group_kfold_init(): - # Test initialization - cv = CustomStratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42) - assert isinstance(cv, CustomStratifiedGroupKFold) - assert cv.n_splits == 5 - assert cv.shuffle == True - assert cv.random_state == 42 - -def test_custom_stratified_group_kfold_split(): - # Test split method - X, y = make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=10, random_state=42) - groups = np.random.choice([1, 2, 3], size=100) - cv = CustomStratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42) - splits = list(cv.split(X, y, groups)) - # Check that the correct number of splits are returned - assert len(splits) == 5 - # Check that the training and test sets are disjoint - for train_index, test_index in splits: - assert len(set(train_index) & set(test_index)) == 0 - -def test_custom_stratified_group_kfold_get_n_splits(): - # Test get_n_splits method - cv = CustomStratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42) - X = np.random.rand(100, 20) - assert cv.get_n_splits(X) == 5 \ No newline at end of file diff --git a/test/test_splits_make_cross_val_split.py b/test/test_splits_make_cross_val_split.py index a8ee9f8..13f215d 100644 --- a/test/test_splits_make_cross_val_split.py +++ b/test/test_splits_make_cross_val_split.py @@ -31,7 +31,7 @@ def 
test_make_cross_val_split_group(): def test_make_cross_val_split_stratgroup(): # Test make_cross_val_split function with StratGroupKFold X, y = make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=10, random_state=42) - groups = np.random.choice([1, 2, 3], size=100) + groups = np.random.choice([1, 2, 3, 4, 5], size=100) cv = make_cross_val_split(method=CrossValMethod.STRATGROUP, n_splits=5, groups=groups, random_state=42) splits = list(cv(X, y)) # Check that the correct number of splits are returned @@ -40,11 +40,11 @@ def test_make_cross_val_split_stratgroup(): for train_index, test_index in splits: assert len(set(train_index) & set(test_index)) == 0 -def test_make_cross_val_split_customstratgroup(): +def test_make_cross_val_split_contistrat(): # Test make_cross_val_split function with CustomStratGroupKFold X, y = make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=10, random_state=42) groups = np.random.choice([1, 2, 3, 4, 5], size=100) - cv = make_cross_val_split(method=CrossValMethod.CUSTOMSTRATGROUP, n_splits=5, groups=groups, random_state=42) + cv = make_cross_val_split(method=CrossValMethod.CONTISTRAT, n_splits=5, groups=groups, random_state=42) splits = list(cv(X, y)) # Check that the correct number of splits are returned assert len(splits) == 5 @@ -52,11 +52,23 @@ def test_make_cross_val_split_customstratgroup(): for train_index, test_index in splits: assert len(set(train_index) & set(test_index)) == 0 -def test_make_cross_val_split_customstrat(): +def test_make_cross_val_split_contigroup(): # Test make_cross_val_split function with CustomStratifiedKFold X, y = make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=10, random_state=42) - groups = np.random.choice([1, 2, 3], size=100) - cv = make_cross_val_split(method=CrossValMethod.CUSTOMSTRAT, n_splits=5, groups=groups, random_state=42) + groups = np.random.choice([1, 2, 3, 4, 5], size=100) + cv = 
make_cross_val_split(method=CrossValMethod.CONTISTRATGROUP, n_splits=5, groups=groups, random_state=42) + splits = list(cv(X, y)) + # Check that the correct number of splits are returned + assert len(splits) == 5 + # Check that the training and test sets are disjoint + for train_index, test_index in splits: + assert len(set(train_index) & set(test_index)) == 0 + +def test_make_cross_val_split_contigroup(): + # Test make_cross_val_split function with CustomStratifiedKFold + X, y = make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=10, random_state=42) + groups = np.random.choice([1, 2, 3, 4, 5], size=100) + cv = make_cross_val_split(method=CrossValMethod.CONTISTRATGROUP, n_splits=5, groups=groups, random_state=42) splits = list(cv(X, y)) # Check that the correct number of splits are returned assert len(splits) == 5 @@ -71,4 +83,42 @@ def test_make_cross_val_split_invalid_method(): except TypeError: pass else: - assert False, "Expected TypeError" \ No newline at end of file + assert False, "Expected TypeError" + +from flexcv.split import make_cross_val_split, CrossValMethod +from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold +from sklearn.datasets import make_classification +import pandas as pd +import numpy as np +import pytest + + +def test_make_cross_val_split_group_is_callable(): + # Test GroupKFold + groups = pd.Series(np.random.choice([1, 2, 3], size=100)) + split_func = make_cross_val_split(groups=groups, method=CrossValMethod.GROUP) + assert callable(split_func) + +def test_make_cross_val_split_invalid_method_raises(): + # Test invalid method + with pytest.raises(TypeError): + make_cross_val_split(groups=None, method="InvalidMethod") + +def test_make_cross_val_split_with_iterator(): + # Test with iterator + iterator = iter([np.array([1, 2, 3]), np.array([4, 5, 6])]) + split_func = make_cross_val_split(groups=None, method=iterator) + assert split_func == iterator + +def 
test_make_cross_val_split_with_cross_validator(): + # Test with cross validator + kfold = KFold(n_splits=5) + split_func = make_cross_val_split(groups=None, method=kfold) + assert split_func == kfold.split + +def test_make_cross_val_split_with_groups_consumer(): + # Test with groups consumer + gkf = GroupKFold(n_splits=5) + groups = pd.Series(np.random.choice([1, 2, 3], size=100)) + split_func = make_cross_val_split(groups=groups, method=gkf) + assert callable(split_func) \ No newline at end of file diff --git a/test/test_z_full_interface_fixed_effects.py b/test/test_z_full_interface_fixed_effects.py index c942629..60f2b0d 100644 --- a/test/test_z_full_interface_fixed_effects.py +++ b/test/test_z_full_interface_fixed_effects.py @@ -1,7 +1,7 @@ import numpy as np import optuna from sklearn.ensemble import RandomForestRegressor - +from sklearn.model_selection import KFold import flexcv.model_postprocessing as mp from flexcv.synthesizer import generate_regression from flexcv.interface import CrossValidation @@ -38,8 +38,37 @@ def simple_regression(): return np.mean(results["LinearModel"]["folds_by_metrics"]["r2"]) -def test_linear_model(): - assert np.isclose([simple_regression()], [0.4265339487499462]) +def set_splits_input_kfold_with_linear_model(): + X, y, _, _ = generate_regression( + 10, 100, n_slopes=1, noise_level=9.1e-2, random_seed=42 + ) + kfold = KFold(n_splits=5, random_state=42, shuffle=True) + model_map = ModelMappingDict( + { + "LinearModel": ModelConfigDict( + { + "model": LinearModel, + "requires_formula": True, + } + ), + } + ) + + cv = CrossValidation() + results = ( + cv.set_data(X, y) + .set_models(model_map) + .set_splits(kfold) + .set_run(Run()) + .perform() + .get_results() + ) + + return np.mean(results["LinearModel"]["folds_by_metrics"]["r2"]) + + +def test_set_splits_input_kfold_with_linear_model(): + assert np.isclose([set_splits_input_kfold_with_linear_model()], [0.4265339487499462]) def random_forest_regression():