Skip to content

Commit

Permalink
update stepwise
Browse files Browse the repository at this point in the history
  • Loading branch information
YpLarryWang committed Mar 17, 2022
1 parent 1e83606 commit 4cc9789
Showing 1 changed file with 26 additions and 18 deletions.
44 changes: 26 additions & 18 deletions stepwise_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from statsmodels.formula.api import ols


def stepwiseSelection(data, tag,
initial_list=list(),
threshold_in=0.05,
threshold_out=0.10,
verbose=True):
def stepwiseSelection(data,
tag,
initial_list=None,
threshold_in=0.05,
threshold_out=0.10,
verbose=True):
'''
data: <Pandas.DataFrame> contaning indep/dep variables
tag: <String> name of target
Expand All @@ -25,63 +26,70 @@ def stepwiseSelection(data, tag,
if mark in col:
ab_dic[col.replace(mark, '')] = col
data.rename(columns={col: col.replace(mark, '')}, inplace=True)

included = initial_list

included = initial_list if initial_list else []
excluded = data.columns.tolist()
excluded.remove(tag)
formula = f'{tag}~1' # set a constant model as initial reduced_model
best_r2_dif = .0

while True: # end loop when no variable gets in/out

excluded = list(set(data.columns) - set(included))
excluded.remove(tag)
changed = False

full_model = ols(formula=formula, data=data).fit()
last_adj_r2 = full_model.rsquared_adj

# forward step
for new_feature in excluded:

# Note here the test_model has more variable than full_model
test_model = ols(formula=formula+f'+{new_feature}', data=data).fit()
test_model = ols(
formula=formula+f'+{new_feature}', data=data).fit()

# find feature whose contribution to adj_r2 largest
if test_model.rsquared_adj - last_adj_r2 > best_r2_dif:
best_r2_dif = test_model.rsquared_adj - last_adj_r2
last_adj_r2 = test_model.rsquared_adj
best_feature = new_feature

# Partial F-test
# Note that in anova_lm models with few variables are put forward
full_model_pro = ols(formula=formula+f'+{best_feature}', data=data).fit()
full_model_pro = ols(
formula=formula+f'+{best_feature}', data=data).fit()
anova_tbl = sm.stats.anova_lm(full_model, full_model_pro)
criterion = anova_tbl['Pr(>F)'][1]

if criterion <= threshold_in:
included.append(best_feature)
excluded.remove(best_feature)
formula += f'+{best_feature}'
full_model = full_model_pro
changed = True
best_r2_dif = .0
if verbose:
print('Add {:25} with f_pvalue {:.6}'.format(best_feature, criterion))
print('Add {:25} with f_pvalue {:.6}'.format(
best_feature, criterion))

# backward step
for old_feature in included:
test_model = ols(formula=formula.replace(f'+{old_feature}', ''), data=data).fit()
test_model = ols(formula=formula.replace(
f'+{old_feature}', ''), data=data).fit()

# Note here the test_model has less variable than full_model
anova_tbl = sm.stats.anova_lm(test_model, full_model)
criterion = anova_tbl['Pr(>F)'][1]
if criterion >= threshold_out:
included.remove(old_feature)
excluded.append(old_feature)
formula = formula.replace(f'+{old_feature}', '')
changed = True
best_r2_dif = .0
if verbose:
print('Drop {:25} with f_pvalue {:.6}'.format(old_feature, criterion))

print('Drop {:25} with f_pvalue {:.6}'.format(
old_feature, criterion))

if not changed:
break
break

return [ab_dic[x] if x in ab_dic.keys() else x for x in included]

0 comments on commit 4cc9789

Please sign in to comment.