Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SequentialFeatureSelection Early Stopping Criterion #886

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions mlxtend/feature_selection/sequential_feature_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin):
n_jobs : int (default: 1)
The number of CPUs to use for evaluating different feature subsets
in parallel. -1 means 'all CPUs'.
early_stop_rounds : int (default 0)
Enable early stopping criterion when > 0, this value determines the
number of iterations after which, if no performance boost has been
seen, execution is stopped.
Used only when `k_features == 'best'` or `k_features == 'parsimonious'`
pre_dispatch : int, or string (default: '2*n_jobs')
Controls the number of jobs that get dispatched
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
Expand Down Expand Up @@ -178,6 +183,7 @@ def __init__(self, estimator, k_features=1,
forward=True, floating=False,
verbose=0, scoring=None,
cv=5, n_jobs=1,
early_stop_rounds=0,
pre_dispatch='2*n_jobs',
clone_estimator=True,
fixed_features=None):
Expand All @@ -201,6 +207,13 @@ def __init__(self, estimator, k_features=1,
self.verbose = verbose
self.clone_estimator = clone_estimator

if not isinstance(early_stop_rounds, int) or early_stop_rounds < 0:
raise ValueError('Number of early stopping round should be '
'an integer value greater than or equal to 0.'
'Got %d' % early_stop_rounds)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think ...Got %d' % early_stop_rounds might not work if early_stop_rounds is not an integer. Maybe it's better to replace it with

...Got %s' % early_stop_rounds

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right!


self.early_stop_rounds = early_stop_rounds

if fixed_features is not None:
if isinstance(self.k_features, int) and \
self.k_features <= len(fixed_features):
Expand Down Expand Up @@ -424,6 +437,8 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
}
best_subset = None
k_score = 0
best_score = -np.inf
early_stop_count = self.early_stop_rounds

try:
while k != k_to_select:
Expand Down Expand Up @@ -550,6 +565,20 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
X)
raise KeyboardInterrupt

# early stop
if self.early_stop_rounds \
and k != k_to_select \
and self.k_features in {'best', 'parsimonious'}:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of the check here, i would maybe change it to raising a ValueError in the top of the function if self.k_features is not in {'best', 'parsimonious'} and self.early_stop_rounds. This way the user is aware, and we only have to perform the check once.

Copy link
Author

@aldder aldder Feb 3, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rasbt Do you prefer having this check on top of fit function or during initialization?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, totally lost track and missed your comment! Sorry! Regarding your question, I think fit might be better to keep it more consistent with scikit-learn behavior.

if k_score <= best_score:
early_stop_count -= 1
if early_stop_count == 0:
print('Performances not improved for %d rounds. '
'Stopping now!' % self.early_stop_rounds)
break
else:
early_stop_count = self.early_stop_rounds
best_score = k_score

except KeyboardInterrupt:
self.interrupted_ = True
sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -983,3 +983,51 @@ def test_custom_feature_names():
assert sfs1.k_feature_names_ == ('sepal width', 'petal width')
assert sfs1.subsets_[2]['feature_names'] == ('sepal width',
'petal width')


def test_run_forward_earlystop():
    """Forward SFS with early stopping halts before exhausting all features.

    Noise columns are appended to iris so the CV score plateaus early;
    with ``early_stop_rounds=esr`` the search must stop before adding
    every feature, and the subset recorded ``esr`` steps before the stop
    must hold the best average score seen.
    """
    np.random.seed(0)
    iris = load_iris()
    X_iris = iris.data
    y_iris = iris.target
    # Pure-noise columns guarantee that performance stops improving.
    X_iris_with_noise = np.concatenate(
        (X_iris,
         np.random.randn(X_iris.shape[0], X_iris.shape[1])),
        axis=1)
    knn = KNeighborsClassifier()
    esr = 2
    # NOTE: the constructor takes only `early_stop_rounds`; a nonzero
    # value enables the criterion (there is no separate `early_stop` flag,
    # and passing one would raise a TypeError).
    sfs = SFS(estimator=knn,
              k_features='best',
              forward=True,
              floating=False,
              early_stop_rounds=esr,
              verbose=0)
    sfs.fit(X_iris_with_noise, y_iris)
    # Stopped early: fewer subsets than total candidate features.
    assert len(sfs.subsets_) < X_iris_with_noise.shape[1]
    # The subset `esr` rounds before termination is the best overall.
    assert all([sfs.subsets_[list(sfs.subsets_)[-esr-1]]['avg_score']
                >= sfs.subsets_[i]['avg_score'] for i in sfs.subsets_.keys()])


def test_run_backward_earlystop():
    """Backward SFS with early stopping halts before removing all features.

    Mirrors the forward test: noise columns make the CV score plateau, so
    with ``early_stop_rounds=esr`` the backward elimination must stop
    before shrinking to a single feature, and the subset recorded ``esr``
    steps before the stop must hold the best average score seen.
    """
    np.random.seed(0)
    iris = load_iris()
    X_iris = iris.data
    y_iris = iris.target
    # Pure-noise columns guarantee that performance stops improving.
    X_iris_with_noise = np.concatenate(
        (X_iris,
         np.random.randn(X_iris.shape[0], X_iris.shape[1])),
        axis=1)
    knn = KNeighborsClassifier()
    esr = 2
    # NOTE: the constructor takes only `early_stop_rounds`; a nonzero
    # value enables the criterion (there is no separate `early_stop` flag,
    # and passing one would raise a TypeError).
    sfs = SFS(estimator=knn,
              k_features='best',
              forward=False,
              floating=False,
              early_stop_rounds=esr,
              verbose=0)
    sfs.fit(X_iris_with_noise, y_iris)
    # Stopped early: elimination did not continue down to one feature.
    assert len(sfs.subsets_) > 1
    # The subset `esr` rounds before termination is the best overall.
    assert all([sfs.subsets_[list(sfs.subsets_)[-esr-1]]['avg_score']
                >= sfs.subsets_[i]['avg_score'] for i in sfs.subsets_.keys()])