Skip to content

Commit

Permalink
XGBoost model for A/B testing
Browse files Browse the repository at this point in the history
  • Loading branch information
tigist13 committed Sep 3, 2022
1 parent 49988c8 commit 320a2f4
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 16 deletions.
27 changes: 11 additions & 16 deletions scripts/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,36 +97,31 @@ def confusion_matrix(self, pred, actual):
return metrics.confusion_matrix(pred, actual)

def calculate_p_values(self):
    """Compute per-feature p-values for the fitted logistic regression.

    Estimates coefficient standard errors from the inverse Fisher
    information matrix (Cramer-Rao bound), then derives two-tailed
    z-test p-values. Adapted from:
    https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d

    Returns:
        pd.DataFrame with columns 'features' and 'p_values'.
    """
    # Fix: X was never bound (the assignment was commented out),
    # which made every call raise NameError.
    X = self.X_train
    denom = 2.0 * (1.0 + np.cosh(self.clf.decision_function(X)))
    denom = np.tile(denom, (X.shape[1], 1)).T
    fisher_info = np.dot((X / denom).T, X)      # Fisher information matrix
    cramer_rao = np.linalg.inv(fisher_info)     # inverse information matrix
    sigma_estimates = np.sqrt(np.diagonal(cramer_rao))
    z_scores = self.clf.coef_[0] / sigma_estimates
    # Two-tailed test: double the one-sided survival-function tail
    # (the original computed only sf(abs(z)), i.e. a one-tailed value,
    # despite the comment claiming a two-tailed test).
    p_values = [2 * stat.norm.sf(abs(z)) for z in z_scores]

    p_value_df = pd.DataFrame()
    p_value_df['features'] = self.X_train.columns.to_list()
    p_value_df['p_values'] = p_values

    return p_value_df

def plot_pvalues(self, p_df):
def plot_pvalues(self, p_value_df):

fig, ax = plt.subplots(figsize=(12,7))

ax.plot([0.05,0.05], [0.05,5])
sns.scatterplot(data=p_df, y='features', x='p_values', color="green")
sns.scatterplot(data=p_value_df, y='features', x='p_values', color="green")
plt.title("P values of features", size=20)

plt.xticks(np.arange(0,max(p_df['p_values']) + 0.05, 0.05))
plt.xticks(np.arange(0,max(p_value_df['p_values']) + 0.05, 0.05))

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
Expand Down
111 changes: 111 additions & 0 deletions scripts/xgboost.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 3 07:23:23 2022
@author: user
"""
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# LOSS FUNCTION
def calculate_loss_function(actual, pred):
    """Return the root-mean-squared error between *actual* and *pred*.

    Args:
        actual: array-like of true values.
        pred: array-like of predicted values (same length as *actual*).

    Returns:
        float: RMSE. 0.0 when the predictions match exactly.
    """
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # Direct numpy computation — identical to
    # sqrt(mean_squared_error(actual, pred)) but self-contained.
    return float(np.sqrt(np.mean((actual - pred) ** 2)))

class XgboostModel:
    """Train and evaluate an XGBoost classifier with k-fold cross-validation.

    The caller supplies a train/test split; ``train_model`` further splits
    the training data with KFold and fits with early stopping on each
    validation fold.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        # Hold-out split provided by the caller (pandas objects: the
        # k-fold loop indexes them with .iloc).
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        self.clf = XGBClassifier(n_estimators=1000, learning_rate=0.05)

    def train_model(self, folds=5):
        """Fit the classifier with k-fold cross-validation.

        Args:
            folds: number of KFold splits; must be >= 2.
                (Fix: the previous default of 1 always raised, because
                sklearn's KFold requires at least 2 splits.)

        Returns:
            (fitted classifier, list of fold accuracies, list of fold losses).

        Raises:
            ValueError: if ``folds`` is less than 2.
        """
        if folds < 2:
            raise ValueError("folds must be >= 2 for KFold cross-validation")

        kf = KFold(n_splits=folds)

        accuracy_arr = []
        loss_arr = []

        for step, (train_index, valid_index) in enumerate(kf.split(self.X_train)):
            X_train, y_train = self.X_train.iloc[train_index], self.y_train.iloc[train_index]
            X_valid, y_valid = self.X_train.iloc[valid_index], self.y_train.iloc[valid_index]

            # Early stopping on the fold's validation set; each fold
            # re-fits (and overwrites) self.clf.
            self.clf = self.clf.fit(
                X_train, y_train,
                early_stopping_rounds=5,
                eval_set=[(X_valid, y_valid)],
                verbose=False,
            )

            valid_pred = self.clf.predict(X_valid)

            accuracy = self.calculate_score(y_valid, valid_pred)
            loss = calculate_loss_function(y_valid, valid_pred)

            self.__printAccuracy(accuracy, step, label="Validation")
            self.__printLoss(loss, step, label="Validation")
            print()

            accuracy_arr.append(accuracy)
            loss_arr.append(loss)

        return self.clf, accuracy_arr, loss_arr

    def test_model(self):
        """Evaluate the fitted classifier on the held-out test set.

        Returns:
            (accuracy, loss, classification report, confusion matrix).
        """
        y_pred = self.clf.predict(self.X_test)

        accuracy = self.calculate_score(self.y_test, y_pred)
        self.__printAccuracy(accuracy, label="Test")

        # Fix: was self.report(...), which does not exist — the method is
        # named report_outcome, so this line raised AttributeError.
        report = self.report_outcome(y_pred, self.y_test)
        matrix = self.confusion_matrix(y_pred, self.y_test)
        loss = calculate_loss_function(self.y_test, y_pred)

        return accuracy, loss, report, matrix

    def __printLoss(self, loss, step=1, label=""):
        # Internal logging helper for per-fold loss.
        print(f"step {step}: {label} Loss of xgboostModel is: {loss:.3f}")

    def calculate_score(self, pred, actual):
        """Return accuracy (symmetric in its arguments)."""
        return metrics.accuracy_score(actual, pred)

    def __printAccuracy(self, acc, step=1, label=""):
        # Internal logging helper for per-fold accuracy.
        print(f"step {step}: {label} Accuracy of XgboostModel is: {acc:.3f}")

    def report_outcome(self, pred, actual):
        """Print and return the sklearn classification report.

        Fix: classification_report's signature is (y_true, y_pred); the
        original passed (pred, actual), swapping precision and recall.
        Also computes the report once instead of twice.
        """
        report = metrics.classification_report(actual, pred)
        print("Test Metrics")
        print("================")
        print(report)
        return report

    def get_feature_importance(self):
        """Return a DataFrame of (feature, importance) for the fitted model."""
        importance = self.clf.feature_importances_
        featureimportance_df = pd.DataFrame()

        featureimportance_df['feature'] = self.X_train.columns.to_list()
        featureimportance_df['feature_importances'] = importance

        return featureimportance_df

    def confusion_matrix(self, pred, actual):
        """Plot (heatmap) and return the confusion matrix.

        Fix: confusion_matrix's signature is (y_true, y_pred); the
        original passed (pred, actual), transposing the matrix so the
        Actual/Predicted axis labels were wrong. Also computed once
        instead of twice.
        """
        cm = metrics.confusion_matrix(actual, pred)
        ax = sns.heatmap(pd.DataFrame(cm))
        plt.title('Confusion Matrix')
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        return cm

0 comments on commit 320a2f4

Please sign in to comment.