Add label encoder for supervised transform

Remove the conversion of y to float in the CatBoost encoder
scikit-learn-contrib · PaulWestenthanner · Dec 3, 2023 · Nov 21, 2023 · Nov 21, 2023 · Nov 30, 2023
commit c6de169f51a2c2f9fbe2f253d9ed4ac675dd8e66
diff --git a/category_encoders/cat_boost.py b/category_encoders/cat_boost.py
@@ -103,7 +103,6 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
 
     def _fit(self, X, y, **kwargs):
         X = X.copy(deep=True)
-        y = y.astype(float) #Incase y is bool or categorical.
         self._mean = y.mean()
         self.mapping = {col: self._fit_column_map(X[col], y) for col in self.cols}
 

diff --git a/category_encoders/utils.py b/category_encoders/utils.py
@@ -6,12 +6,13 @@
 import pandas as pd
 import numpy as np
 import sklearn.base
-from pandas.api.types import is_object_dtype, is_string_dtype
+from pandas.api.types import is_object_dtype, is_string_dtype, is_numeric_dtype
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.exceptions import NotFittedError
 from typing import Dict, List, Optional, Union
 from scipy.sparse import csr_matrix
+from sklearn.preprocessing import LabelEncoder
 
 __author__ = 'willmcginnis'
 
@@ -294,11 +295,18 @@ def fit(self, X, y=None, **kwargs):
             Returns self.
 
         """
-        self._check_fit_inputs(X, y)
         X, y = convert_inputs(X, y)
+        self._check_fit_inputs(X, y)
         self.feature_names_in_ = X.columns.tolist()
         self.n_features_in_ = len(self.feature_names_in_)
 
+        if self._get_tags().get('supervised_encoder'):
+            if not is_numeric_dtype(y):
+                self.lab_encoder_ = LabelEncoder()
+                y = self.lab_encoder_.fit_transform(y)
+            else:
+                self.lab_encoder_ = None
+
         self._dim = X.shape[1]
         self._determine_fit_columns(X)
 
@@ -324,8 +332,12 @@ def fit(self, X, y=None, **kwargs):
         return self
 
     def _check_fit_inputs(self, X, y):
-        if self._get_tags().get('supervised_encoder') and y is None:
-            raise ValueError('Supervised encoders need a target for the fitting. The target cannot be None')
+        if self._get_tags().get('supervised_encoder'):
+            if y is None:
+                raise ValueError('Supervised encoders need a target for the fitting. The target cannot be None')
+            else:
+                if y.isna().any(): # Target column should never have missing values
+                    raise ValueError("The target column y must not contain missing values.")
 
     def _check_transform_inputs(self, X):
         if self.handle_missing == 'error':
@@ -435,6 +447,8 @@ def transform(self, X, y=None, override_return_df=False):
         # first check the type
         X, y = convert_inputs(X, y, deep=True)
         self._check_transform_inputs(X)
+        if y is not None and self.lab_encoder_ is not None:
+            y = self.lab_encoder_.transform(y)
 
         if not list(self.cols):
             return X

diff --git a/category_encoders/woe.py b/category_encoders/woe.py
@@ -3,6 +3,7 @@
 from category_encoders.ordinal import OrdinalEncoder
 import category_encoders.utils as util
 from sklearn.utils.random import check_random_state
+import pandas as pd
 
 __author__ = 'Jan Motl'
 
@@ -87,6 +88,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
 
     def _fit(self, X, y, **kwargs):
         # The label must be binary with values {0,1}
+        y = pd.Series(y)
         unique = y.unique()
         if len(unique) != 2:
             raise ValueError("The target column y must be binary. But the target contains " + str(len(unique)) + " unique value(s).")