Skip to content

Commit

Permalink
better handling of None as a cols param
Browse files Browse the repository at this point in the history
  • Loading branch information
Will McGinnis committed May 31, 2016
1 parent 2cc2cc1 commit b921bc7
Show file tree
Hide file tree
Showing 10 changed files with 85 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ v1.1.0
======

* Optionally pass drop_invariant to any encoder to consistently drop columns with 0 variance from the output (based on training set data in fit())
* If None is passed as the cols param, every string column (pandas dtype = object) will be encoded.

v1.0.5
======

Expand Down
11 changes: 11 additions & 0 deletions category_encoders/backward_difference.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sklearn.base import BaseEstimator, TransformerMixin
from patsy.highlevel import dmatrix
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.utils import get_obj_cols

__author__ = 'willmcginnis'

Expand Down Expand Up @@ -71,8 +72,18 @@ def fit(self, X, y=None, **kwargs):
:return:
"""

# if the input dataset isn't already a dataframe, convert it to one (using default column names)
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

# if columns aren't passed, just use every string column
if self.cols is None:
self.cols = get_obj_cols(X)

# train an ordinal pre-encoder
self.ordinal_encoder = self.ordinal_encoder.fit(X)

# drop all output columns with 0 variance.
if self.drop_invariant:
self.drop_cols = []
X_temp = self.transform(X)
Expand Down
11 changes: 11 additions & 0 deletions category_encoders/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.utils import get_obj_cols

__author__ = 'willmcginnis'

Expand Down Expand Up @@ -77,8 +78,18 @@ def fit(self, X, y=None, **kwargs):
:return:
"""

# if the input dataset isn't already a dataframe, convert it to one (using default column names)
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

# if columns aren't passed, just use every string column
if self.cols is None:
self.cols = get_obj_cols(X)

# train an ordinal pre-encoder
self.ordinal_encoder = self.ordinal_encoder.fit(X)

# drop all output columns with 0 variance.
if self.drop_invariant:
self.drop_cols = []
X_temp = self.transform(X)
Expand Down
10 changes: 10 additions & 0 deletions category_encoders/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import copy
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from category_encoders.utils import get_obj_cols

__author__ = 'willmcginnis'

Expand Down Expand Up @@ -100,6 +101,15 @@ def fit(self, X, y=None, **kwargs):
:return:
"""

# if the input dataset isn't already a dataframe, convert it to one (using default column names)
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

# if columns aren't passed, just use every string column
if self.cols is None:
self.cols = get_obj_cols(X)

# drop all output columns with 0 variance.
if self.drop_invariant:
self.drop_cols = []
X_temp = self.transform(X)
Expand Down
1 change: 1 addition & 0 deletions category_encoders/helmert.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sklearn.base import BaseEstimator, TransformerMixin
from patsy.highlevel import dmatrix
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.utils import get_obj_cols

__author__ = 'willmcginnis'

Expand Down
8 changes: 8 additions & 0 deletions category_encoders/ordinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import copy
from sklearn.base import BaseEstimator, TransformerMixin
import random
from category_encoders.utils import get_obj_cols

__author__ = 'willmcginnis'

Expand All @@ -34,6 +35,7 @@ def ordinal_encoding(X_in, mapping=None, cols=None):
for switch in mapping:
for category in switch.get('mapping'):
X.loc[X[switch.get('col')] == category[0], switch.get('col')] = str(category[1])

X[switch.get('col')] = X[switch.get('col')].astype(int).reshape(-1, )
else:
for col in cols:
Expand Down Expand Up @@ -78,12 +80,18 @@ def fit(self, X, y=None, **kwargs):
:return:
"""

# if the input dataset isn't already a dataframe, convert it to one (using default column names)
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

# if columns aren't passed, just use every string column
if self.cols is None:
self.cols = get_obj_cols(X)

_, categories = ordinal_encoding(X, mapping=self.mapping, cols=self.cols)
self.mapping = categories

# drop all output columns with 0 variance.
if self.drop_invariant:
self.drop_cols = []
X_temp = self.transform(X)
Expand Down
11 changes: 10 additions & 1 deletion category_encoders/polynomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from patsy.highlevel import dmatrix
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.utils import get_obj_cols

__author__ = 'willmcginnis'

Expand Down Expand Up @@ -68,6 +68,15 @@ def fit(self, X, y=None, **kwargs):
:return:
"""

# if the input dataset isn't already a dataframe, convert it to one (using default column names)
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

# if columns aren't passed, just use every string column
if self.cols is None:
self.cols = get_obj_cols(X)

# drop all output columns with 0 variance.
if self.drop_invariant:
self.drop_cols = []
X_temp = self.transform(X)
Expand Down
13 changes: 13 additions & 0 deletions category_encoders/sum_coding.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sklearn.base import BaseEstimator, TransformerMixin
from patsy.highlevel import dmatrix
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.utils import get_obj_cols

__author__ = 'willmcginnis'

Expand Down Expand Up @@ -69,8 +70,20 @@ def fit(self, X, y=None, **kwargs):
:return:
"""

# if the input dataset isn't already a dataframe, convert it to one (using default column names)
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

# if columns aren't passed, just use every string column
if self.cols is None:
self.cols = get_obj_cols(X)

self.ordinal_encoder = self.ordinal_encoder.fit(X)

# train an ordinal pre-encoder
self.ordinal_encoder = self.ordinal_encoder.fit(X)

# drop all output columns with 0 variance.
if self.drop_invariant:
self.drop_cols = []
X_temp = self.transform(X)
Expand Down
18 changes: 18 additions & 0 deletions category_encoders/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""
.. module:: utils
:synopsis: generic helper functions shared by many encoders
:platform:
"""

__author__ = 'willmcginnis'


def get_obj_cols(df):
    """
    Collect the labels of every column in a dataframe whose dtype is 'object'.

    :param df: the dataframe to inspect (its ``dtypes`` and ``columns`` are read)
    :return: list of column labels, in column order, whose dtype compares equal to 'object'
    """

    labels = df.columns.values

    # dtypes are walked positionally so each one lines up with its label in df.columns
    return [labels[pos] for pos, dtype in enumerate(df.dtypes) if dtype == 'object']
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from codecs import open
from os import path

__version__ = '1.0.5'
__version__ = '1.1.0'


here = path.abspath(path.dirname(__file__))
Expand Down

0 comments on commit b921bc7

Please sign in to comment.