Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SequentialFeatureSelection Early Stopping Criterion #886

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions mlxtend/feature_selection/sequential_feature_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin):
n_jobs : int (default: 1)
The number of CPUs to use for evaluating different feature subsets
in parallel. -1 means 'all CPUs'.
early_stop_rounds : int (default 0)
Enable early stopping criterion when > 0, this value determines the
number of iterations after which, if no performance boost has been
seen, execution is stopped.
Used only when `k_features == 'best'` or `k_features == 'parsimonious'`.
pre_dispatch : int, or string (default: '2*n_jobs')
Controls the number of jobs that get dispatched
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
Expand Down Expand Up @@ -174,10 +179,12 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin):
http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/

"""

def __init__(self, estimator, k_features=1,
forward=True, floating=False,
verbose=0, scoring=None,
cv=5, n_jobs=1,
early_stop_rounds=0,
pre_dispatch='2*n_jobs',
clone_estimator=True,
fixed_features=None):
Expand All @@ -201,6 +208,13 @@ def __init__(self, estimator, k_features=1,
self.verbose = verbose
self.clone_estimator = clone_estimator

if not isinstance(early_stop_rounds, int) or early_stop_rounds < 0:
raise ValueError('Number of early stopping rounds should be '
'an integer value greater than or equal to 0. '
'Got %s' % early_stop_rounds)

self.early_stop_rounds = early_stop_rounds

if fixed_features is not None:
if isinstance(self.k_features, int) and \
self.k_features <= len(fixed_features):
Expand Down Expand Up @@ -385,6 +399,12 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
select_in_range = False
k_to_select = self.k_features

if self.early_stop_rounds and isinstance(self.k_features, str) and\
not self.k_features in {'best', 'parsimonious'}:
raise ValueError('Early stopping is allowed only when `k_features`'
' is "best" or "parsimonious". Got'
' `k_features=%s`' % self.k_features)

orig_set = set(range(X_.shape[1]))
n_features = X_.shape[1]

Expand Down Expand Up @@ -424,6 +444,8 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
}
best_subset = None
k_score = 0
best_score = -np.inf
early_stop_count = self.early_stop_rounds

try:
while k != k_to_select:
Expand Down Expand Up @@ -550,6 +572,18 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
X)
raise KeyboardInterrupt

# early stop
if self.early_stop_rounds and k != k_to_select:
if k_score <= best_score:
early_stop_count -= 1
if early_stop_count == 0:
print('Performance has not improved for %d rounds. '
'Stopping now!' % self.early_stop_rounds)
break
else:
early_stop_count = self.early_stop_rounds
best_score = k_score

except KeyboardInterrupt:
self.interrupted_ = True
sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -978,8 +978,54 @@ def test_custom_feature_names():
n_jobs=1)

sfs1 = sfs1.fit(X, y, custom_feature_names=(
'sepal length', 'sepal width', 'petal length', 'petal width'))
'sepal length', 'sepal width', 'petal length', 'petal width'))
assert sfs1.k_feature_idx_ == (1, 3)
assert sfs1.k_feature_names_ == ('sepal width', 'petal width')
assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
'petal width')


def test_run_forward_earlystop():
    """Forward selection with early stopping halts before visiting all sizes.

    Pure-noise columns are appended to the iris features so the CV score
    stops improving partway through; with ``early_stop_rounds=2`` the
    selector should terminate before reaching the full feature count.
    """
    np.random.seed(0)
    iris = load_iris()
    X_iris = iris.data
    y_iris = iris.target
    # Noise block with the same shape as the real features.
    noise = np.random.randn(X_iris.shape[0], X_iris.shape[1])
    X_iris_with_noise = np.concatenate((X_iris, noise), axis=1)
    selector = SFS(estimator=KNeighborsClassifier(),
                   k_features='best',
                   forward=True,
                   floating=False,
                   early_stop_rounds=2,
                   verbose=0)
    selector.fit(X_iris_with_noise, y_iris)
    # Early stop: fewer visited subset sizes than total feature count.
    assert len(selector.subsets_) < X_iris_with_noise.shape[1]
    # The reported best score dominates every visited subset's score.
    assert all(selector.k_score_ >= selector.subsets_[k]['avg_score']
               for k in selector.subsets_)


def test_run_backward_earlystop():
    """Backward elimination with early stopping terminates after > 1 step.

    Same noisy-iris setup as the forward test, but eliminating features
    from the full set; early stopping should still leave more than one
    recorded subset while keeping the best score maximal.
    """
    np.random.seed(0)
    iris = load_iris()
    X_iris = iris.data
    y_iris = iris.target
    # Noise block with the same shape as the real features.
    noise = np.random.randn(X_iris.shape[0], X_iris.shape[1])
    X_iris_with_noise = np.concatenate((X_iris, noise), axis=1)
    selector = SFS(estimator=KNeighborsClassifier(),
                   k_features='best',
                   forward=False,
                   floating=False,
                   early_stop_rounds=2,
                   verbose=0)
    selector.fit(X_iris_with_noise, y_iris)
    # More than one subset size was evaluated before stopping.
    assert len(selector.subsets_) > 1
    # The reported best score dominates every visited subset's score.
    assert all(selector.k_score_ >= selector.subsets_[k]['avg_score']
               for k in selector.subsets_)