Skip to content

Commit

Permalink
remove nan metamorphic test because metamorphic behaviour is removed …
Browse files Browse the repository at this point in the history
…from ordinal encoder
  • Loading branch information
PaulWestenthanner committed Jan 24, 2023
1 parent d19f69e commit ac0fb56
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 23 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
unreleased
==========

* added: ignore option for one-hot-encoding
* fixed: external dependency in unit test
* fixed: gaps in ordinal encoding if nan values are present
* fixed: sklearn compliance: add `feature_names_in_` attribute
* fixed: add RankHotEncoder in documentation
Expand Down
2 changes: 1 addition & 1 deletion category_encoders/rankhot.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def generate_mapping(self):
index = []
new_columns = []

for cat_name, class_ in values.iteritems():
for cat_name, class_ in values.items():
if self.use_cat_names:
n_col_name = f"{col}_{cat_name}"
found_count = found_column_counts.get(n_col_name, 0)
Expand Down
31 changes: 9 additions & 22 deletions tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,20 +439,14 @@ def test_duplicate_index_value(self):
self.assertEqual(5, len(result))

def test_string_index(self):
# https://github.com/scikit-learn-contrib/categorical-encoding/issues/131

bunch = sklearn.datasets.fetch_openml(name="house_prices", as_frame=True)
y = (bunch.target > 200000).values
X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
X.index = X.index.values.astype(str)

display_cols = ["Id", "MSSubClass", "MSZoning", "YearBuilt", "Heating", "CentralAir"]
X = X[display_cols]
train = pd.DataFrame({'city': ['chicago', 'denver']})
target = [0, 1]
train.index = train.index.values.astype(str)

for encoder_name in encoders.__all__:
with self.subTest(encoder_name=encoder_name):
enc = getattr(encoders, encoder_name)(cols=['CentralAir', 'Heating'])
result = enc.fit_transform(X, y)
enc = getattr(encoders, encoder_name)()
result = enc.fit_transform(train, target)
self.assertFalse(result.isnull().values.any(), 'There should not be any missing value!')

def test_get_feature_names_out(self):
Expand Down Expand Up @@ -609,8 +603,7 @@ def test_metamorphic(self):
x3 = pd.DataFrame(data={'x': ['A', 'B', 'B']}) # DataFrame
x4 = pd.Series(['A', 'B', 'B'], dtype='category') # Series with category data type
x5 = np.array(['A', 'B', 'B']) # Numpy
x6 = [np.NaN, 'B', 'B'] # Missing value
x7 = ['Z', 'Y', 'Y'] # Different strings, reversed alphabetic ordering (it works because we look at the order of appearance, not at alphabetic order)
x6 = ['Z', 'Y', 'Y'] # Different strings, reversed alphabetic ordering (it works because we look at the order of appearance, not at alphabetic order)

y = [1, 1, 0]

Expand All @@ -636,18 +629,12 @@ def test_metamorphic(self):
result5 = enc5.fit_transform(x5, y)
self.assertTrue((result1.values == result5.values).all())

# the gray, rankhot and ordinal encoders re-order inputs so that nan is last, hence the output is changed
if encoder_name not in ["GrayEncoder", "RankHotEncoder", "OrdinalEncoder"]:
enc6 = getattr(encoders, encoder_name)()
result6 = enc6.fit_transform(x6, y)
self.assertTrue((result1.values == result6.values).all())

# gray encoder actually does re-order inputs
# rankhot encoder respects order, in this example the order is switched
if encoder_name not in ["GrayEncoder", "RankHotEncoder"]:
enc7 = getattr(encoders, encoder_name)()
result7 = enc7.fit_transform(x7, y)
self.assertTrue((result1.values == result7.values).all())
enc6 = getattr(encoders, encoder_name)()
result6 = enc6.fit_transform(x6, y)
self.assertTrue((result1.values == result6.values).all())

# Arguments
enc9 = getattr(encoders, encoder_name)(return_df=False)
Expand Down

0 comments on commit ac0fb56

Please sign in to comment.