-
Notifications
You must be signed in to change notification settings - Fork 0
/
xgb_starter.py
168 lines (127 loc) · 6.15 KB
/
xgb_starter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime
import operator
from collections import Counter

try:
    # scikit-learn >= 0.18; the old cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split  # legacy scikit-learn

from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from pylab import plot, show, subplot, specgram, imshow, savefig
RS = 12357      # global random seed (used by numpy, train/val split, and XGBoost)
ROUNDS = 500    # number of XGBoost boosting rounds
print("Started")
np.random.seed(RS)  # make numpy-based randomness reproducible
input_folder = './data/input/'  # directory expected to contain train.csv and test.csv
def train_xgb(X, y, params):
    """Train an XGBoost booster on an 80/20 split of (X, y) and return it.

    Validation loss is reported each round via the watchlist.
    """
    print("Will train XGB for {} rounds, RandomSeed: {}".format(ROUNDS, RS))

    X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=RS)

    d_train = xgb.DMatrix(X_tr, label=y_tr)
    d_valid = xgb.DMatrix(X_va, label=y_va)
    evals = [(d_train, 'train'), (d_valid, 'eval')]

    return xgb.train(params, d_train, ROUNDS, evals)
def predict_xgb(clr, X_test):
    """Return the booster's predictions for X_test."""
    d_test = xgb.DMatrix(X_test)
    return clr.predict(d_test)
def create_feature_map(features):
    """Write an XGBoost feature-map file named 'xgb.fmap'.

    One line per feature: '<index>\t<name>\tq' ('q' marks a quantitative
    feature). Used later by Booster.get_fscore(fmap=...).

    :param features: ordered iterable of feature names.
    """
    # 'with' guarantees the file is closed even if a write fails
    # (the original left the handle open on exception).
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
def main():
    """End-to-end pipeline for the Quora duplicate-question task.

    Loads train/test CSVs, derives word-share features per question pair,
    oversamples the negative class toward the expected test proportion,
    trains an XGBoost classifier, then writes a submission CSV and a
    feature-importance plot.
    """
    params = {}
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'logloss'
    params['eta'] = 0.05       # learning rate
    params['max_depth'] = 5
    params['silent'] = 1       # NOTE(review): removed in newer XGBoost — confirm version
    params['seed'] = RS

    df_train = pd.read_csv(input_folder + 'train.csv')
    df_test = pd.read_csv(input_folder + 'test.csv')
    print("Original data: X_train: {}, X_test: {}".format(df_train.shape, df_test.shape))

    print("Features processing, be patient...")

    # If a word appears only once, we ignore it completely (likely a typo)
    # Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
    def get_weight(count, eps=10000, min_count=2):
        # Inverse-frequency weight; words rarer than min_count contribute nothing.
        return 0 if count < min_count else 1 / (count + eps)

    train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
    words = (" ".join(train_qs)).lower().split()
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}

    stops = set(stopwords.words("english"))

    def word_shares(row):
        # Encodes five features as 'R1:R2:shared_count:R31:R32':
        # R1 tfidf-weighted share, R2 raw count share, number of shared
        # non-stopword tokens, and each question's stopword/word ratio.
        q1 = set(str(row['question1']).lower().split())
        q1words = q1.difference(stops)
        if len(q1words) == 0:
            return '0:0:0:0:0'

        q2 = set(str(row['question2']).lower().split())
        q2words = q2.difference(stops)
        if len(q2words) == 0:
            return '0:0:0:0:0'

        q1stops = q1.intersection(stops)
        q2stops = q2.intersection(stops)

        shared_words = q1words.intersection(q2words)
        shared_weights = [weights.get(w, 0) for w in shared_words]
        total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

        R1 = np.sum(shared_weights) / np.sum(total_weights)     # tfidf share
        R2 = len(shared_words) / (len(q1words) + len(q2words))  # count share
        R31 = len(q1stops) / len(q1words)                       # stops in q1
        R32 = len(q2stops) / len(q2words)                       # stops in q2
        return '{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32)

    df = pd.concat([df_train, df_test])
    # BUGFIX: the original passed raw=True, which hands word_shares a bare
    # ndarray, so the row['question1'] label lookups fail on modern pandas.
    df['word_shares'] = df.apply(word_shares, axis=1)

    x = pd.DataFrame()

    # BUGFIX: word_match and tfidf_word_match were swapped relative to the
    # encoding above (field 0 is the tfidf share R1, field 1 the raw count
    # share R2); assign each field to its correctly named column.
    x['word_match'] = df['word_shares'].apply(lambda s: float(s.split(':')[1]))
    x['tfidf_word_match'] = df['word_shares'].apply(lambda s: float(s.split(':')[0]))
    x['shared_count'] = df['word_shares'].apply(lambda s: float(s.split(':')[2]))

    x['stops1_ratio'] = df['word_shares'].apply(lambda s: float(s.split(':')[3]))
    x['stops2_ratio'] = df['word_shares'].apply(lambda s: float(s.split(':')[4]))
    x['diff_stops_r'] = x['stops1_ratio'] - x['stops2_ratio']

    x['len_q1'] = df['question1'].apply(lambda s: len(str(s)))
    x['len_q2'] = df['question2'].apply(lambda s: len(str(s)))
    x['diff_len'] = x['len_q1'] - x['len_q2']

    x['len_char_q1'] = df['question1'].apply(lambda s: len(str(s).replace(' ', '')))
    x['len_char_q2'] = df['question2'].apply(lambda s: len(str(s).replace(' ', '')))
    x['diff_len_char'] = x['len_char_q1'] - x['len_char_q2']

    x['len_word_q1'] = df['question1'].apply(lambda s: len(str(s).split()))
    x['len_word_q2'] = df['question2'].apply(lambda s: len(str(s).split()))
    x['diff_len_word'] = x['len_word_q1'] - x['len_word_q2']

    # ... YOUR FEATURES HERE ...

    feature_names = list(x.columns.values)
    create_feature_map(feature_names)
    print("Features: {}".format(feature_names))

    # df was built as train followed by test, so split back positionally.
    x_train = x[:df_train.shape[0]]
    x_test = x[df_train.shape[0]:]
    y_train = df_train['is_duplicate'].values
    del x, df_train  # release the combined frame before training

    if 1:  # Now we oversample the negative class - on your own risk of overfitting!
        pos_train = x_train[y_train == 1]
        neg_train = x_train[y_train == 0]

        print("Oversampling started for proportion: {}".format(len(pos_train) / (len(pos_train) + len(neg_train))))
        p = 0.165  # target positive proportion (approximate public test ratio)
        scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
        # Double the negatives while more than one full extra copy is
        # needed, then append the fractional remainder.
        while scale > 1:
            neg_train = pd.concat([neg_train, neg_train])
            scale -= 1
        neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
        print("Oversampling done, new proportion: {}".format(len(pos_train) / (len(pos_train) + len(neg_train))))

        # Rebuild labels to match the pos-then-neg row order.
        x_train = pd.concat([pos_train, neg_train])
        y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
        del pos_train, neg_train

    print("Training data: X_train: {}, Y_train: {}, X_test: {}".format(x_train.shape, len(y_train), x_test.shape))
    clr = train_xgb(x_train, y_train, params)
    preds = predict_xgb(clr, x_test)

    print("Writing output...")
    sub = pd.DataFrame()
    sub['test_id'] = df_test['test_id']
    sub['is_duplicate'] = preds
    sub.to_csv("xgb_seed_starter_{}_n{}.csv".format(RS, ROUNDS), index=False)

    print("Features importances...")
    importance = clr.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    ft = pd.DataFrame(importance, columns=['feature', 'fscore'])

    ft.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 25))
    plt.gcf().savefig('features_importance.png')
# Guard the entry point so importing this module does not kick off a full
# training run as a side effect.
if __name__ == '__main__':
    main()
    print("Done.")