allow arbitrary cross validation fold indices #3353

Merged · 2 commits · Jun 30, 2018
Changes from 1 commit
allow arbitrary cross validation fold indices
 - use training indices passed to `folds` parameter in `training.cv`
 - update doc string
Oliver Laslett committed Jun 4, 2018
commit 97be08db04acc37f43121af665a077c8fee3cbe5
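
For orientation before the diff: the usage this commit enables might look roughly like the sketch below. It is a hedged example only; the toy data, parameter values, and fold boundaries are invented for illustration, and the `folds=[(train_indices, test_indices), ...]` form is taken from the docstring change further down.

```python
import numpy as np
import xgboost as xgb

# Toy data; shapes and values are illustrative only.
X = np.random.rand(100, 4)
y = np.random.randint(2, size=100)
dtrain = xgb.DMatrix(X, label=y)

# Explicit fold indices: one (train_indices, test_indices) tuple per fold.
folds = [
    (np.arange(0, 80), np.arange(80, 100)),
    (np.arange(20, 100), np.arange(0, 20)),
]

# With this commit, cv trains each fold on exactly the indices given above
# instead of deriving the training rows from the test indices itself.
results = xgb.cv(
    {'objective': 'binary:logistic'},
    dtrain,
    num_boost_round=5,
    folds=folds,
)
print(results)
```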
40 changes: 31 additions & 9 deletions python-package/xgboost/training.py
```diff
@@ -231,22 +231,39 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
     np.random.seed(seed)
 
     if stratified is False and folds is None:
+        # Do standard k-fold cross validation
         if shuffle is True:
             idx = np.random.permutation(dall.num_row())
         else:
             idx = np.arange(dall.num_row())
-        idset = np.array_split(idx, nfold)
-    elif folds is not None and isinstance(folds, list):
-        idset = [x[1] for x in folds]
-        nfold = len(idset)
+        out_idset = np.array_split(idx, nfold)
+        in_idset = [
+            np.concatenate([out_idset[i] for i in range(nfold) if k != i])
+            for k in range(nfold)
+        ]
+    elif folds is not None:
+        # Use user specified custom split using indices
+        try:
+            in_idset = [x[0] for x in folds]
+            out_idset = [x[1] for x in folds]
+        except TypeError:
+            # Custom stratification using Sklearn KFoldSplit object
+            splits = list(folds.split(X=dall.get_label(), y=dall.get_label()))
+            in_idset = [x[0] for x in splits]
+            out_idset = [x[1] for x in splits]
+        nfold = len(out_idset)
     else:
+        # Do standard stratefied shuffle k-fold split
         sfk = XGBStratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
-        idset = [x[1] for x in sfk.split(X=dall.get_label(), y=dall.get_label())]
+        splits = list(sfk.split(X=dall.get_label(), y=dall.get_label()))
+        in_idset = [x[0] for x in splits]
+        out_idset = [x[1] for x in splits]
+        nfold = len(out_idset)
 
     ret = []
     for k in range(nfold):
-        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
-        dtest = dall.slice(idset[k])
+        dtrain = dall.slice(in_idset[k])
+        dtest = dall.slice(out_idset[k])
         # run preprocessing on the data set if needed
         if fpreproc is not None:
             dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
```
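
The index bookkeeping in the standard k-fold branch above can be reproduced in isolation; a minimal standalone sketch (variable names mirror the diff, but this is not library code):

```python
import numpy as np

nfold = 3
n_rows = 10
idx = np.random.permutation(n_rows)

# Held-out (test) indices per fold: split the permuted rows into nfold chunks.
out_idset = np.array_split(idx, nfold)

# Training indices per fold: every row not held out for that fold.
in_idset = [
    np.concatenate([out_idset[i] for i in range(nfold) if k != i])
    for k in range(nfold)
]

for k in range(nfold):
    # Each fold's train and test indices are disjoint and together cover all rows.
    assert set(in_idset[k]).isdisjoint(out_idset[k])
    assert len(in_idset[k]) + len(out_idset[k]) == n_rows
```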
```diff
@@ -308,8 +325,13 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
         Number of folds in CV.
     stratified : bool
         Perform stratified sampling.
-    folds : a KFold or StratifiedKFold instance
-        Sklearn KFolds or StratifiedKFolds.
+    folds : a KFold or StratifiedKFold instance or list of fold indices
+        Sklearn KFolds or StratifiedKFolds object.
+        Alternatively may explicitly pass sample indices for each fold.
+        For `n` folds, `folds` should be a length `n` list of tuples.
+        Each tuple is `(in,out)` where `in` is a list of indices to be used
+        as the training samples for the `n`th fold and `out` is a list of
+        indices to be used as the testing samples for the `n`th fold.
     metrics : string or list of strings
         Evaluation metrics to be watched in CV.
     obj : function
```
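
The docstring above also allows a scikit-learn KFold or StratifiedKFold instance; in the new code, iterating such an object raises TypeError, so the split indices are taken from `folds.split(...)` instead. A hedged usage sketch with invented data and parameters:

```python
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

# Illustrative toy data.
X = np.random.rand(200, 6)
y = np.random.randint(2, size=200)
dtrain = xgb.DMatrix(X, label=y)

# A splitter object is not an index list, so iterating it raises TypeError
# and mknfold falls back to folds.split(...) for the (train, test) index pairs.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

results = xgb.cv(
    {'objective': 'binary:logistic', 'max_depth': 3},
    dtrain,
    num_boost_round=10,
    folds=skf,
)
print(results)
```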