From 0be706b58bbd8222f970398e2dd68657b71ccbae Mon Sep 17 00:00:00 2001 From: Adam Wisniewski Date: Wed, 1 Jun 2016 00:48:33 +0000 Subject: [PATCH] working on P2 --- .../student_intervention.ipynb | 177 ++++++++++++------ 1 file changed, 123 insertions(+), 54 deletions(-) diff --git a/projects/student_intervention/student_intervention.ipynb b/projects/student_intervention/student_intervention.ipynb index 722459916f..e3fa2115a3 100644 --- a/projects/student_intervention/student_intervention.ipynb +++ b/projects/student_intervention/student_intervention.ipynb @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 3, "metadata": { "collapsed": false }, @@ -347,7 +347,7 @@ "text": [ "Training DecisionTreeClassifier...\n", "Done!\n", - "Training time (secs): 0.007\n" + "Training time (secs): 0.004\n" ] } ], @@ -417,8 +417,8 @@ "text": [ "Predicting labels using DecisionTreeClassifier...\n", "Done!\n", - "Prediction time (secs): 0.001\n", - "F1 score for test set: 0.759124087591\n" + "Prediction time (secs): 0.000\n", + "F1 score for test set: 0.682170542636\n" ] } ], @@ -445,12 +445,12 @@ "Training time (secs): 0.002\n", "Predicting labels using DecisionTreeClassifier...\n", "Done!\n", - "Prediction time (secs): 0.001\n", + "Prediction time (secs): 0.003\n", "F1 score for training set: 1.0\n", "Predicting labels using DecisionTreeClassifier...\n", "Done!\n", "Prediction time (secs): 0.000\n", - "F1 score for test set: 0.672413793103\n", + "F1 score for test set: 0.632478632479\n", "------------------------------------------\n", "Training set size: 200\n", "Training DecisionTreeClassifier...\n", @@ -463,7 +463,7 @@ "Predicting labels using DecisionTreeClassifier...\n", "Done!\n", "Prediction time (secs): 0.000\n", - "F1 score for test set: 0.791044776119\n", + "F1 score for test set: 0.802919708029\n", "------------------------------------------\n", "Training set size: 300\n", "Training DecisionTreeClassifier...\n", @@ -476,7 +476,7 @@ 
"Predicting labels using DecisionTreeClassifier...\n", "Done!\n", "Prediction time (secs): 0.000\n", - "F1 score for test set: 0.736842105263\n" + "F1 score for test set: 0.712121212121\n" ] } ], @@ -526,7 +526,7 @@ "text": [ "Training SVC...\n", "Done!\n", - "Training time (secs): 0.010\n" + "Training time (secs): 0.009\n" ] } ], @@ -562,7 +562,7 @@ "text": [ "Predicting labels using SVC...\n", "Done!\n", - "Prediction time (secs): 0.011\n", + "Prediction time (secs): 0.008\n", "F1 score for training set: 0.858387799564\n" ] } @@ -596,7 +596,7 @@ "text": [ "Predicting labels using SVC...\n", "Done!\n", - "Prediction time (secs): 0.004\n", + "Prediction time (secs): 0.002\n", "F1 score for test set: 0.846153846154\n" ] } @@ -705,7 +705,7 @@ "text": [ "Training KNeighborsClassifier...\n", "Done!\n", - "Training time (secs): 0.001\n" + "Training time (secs): 0.002\n" ] } ], @@ -800,10 +800,10 @@ "Training set size: 100\n", "Training KNeighborsClassifier...\n", "Done!\n", - "Training time (secs): 0.001\n", + "Training time (secs): 0.002\n", "Predicting labels using KNeighborsClassifier...\n", "Done!\n", - "Prediction time (secs): 0.003\n", + "Prediction time (secs): 0.002\n", "F1 score for training set: 0.788321167883\n", "Predicting labels using KNeighborsClassifier...\n", "Done!\n", @@ -820,7 +820,7 @@ "F1 score for training set: 0.834482758621\n", "Predicting labels using KNeighborsClassifier...\n", "Done!\n", - "Prediction time (secs): 0.002\n", + "Prediction time (secs): 0.003\n", "F1 score for test set: 0.797101449275\n", "------------------------------------------\n", "Training set size: 300\n", @@ -833,7 +833,7 @@ "F1 score for training set: 0.855813953488\n", "Predicting labels using KNeighborsClassifier...\n", "Done!\n", - "Prediction time (secs): 0.004\n", + "Prediction time (secs): 0.003\n", "F1 score for test set: 0.768115942029\n" ] } @@ -873,7 +873,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 20, "metadata": { 
"collapsed": false }, @@ -881,13 +881,13 @@ { "data": { "text/html": [ - "
class 100 train 100 test 200 train 200 test 300 train 300 test
KNeighborsClassifier 0.7883 0.7727 0.8345 0.7971 0.8558 0.7681
SVC 0.8591 0.8333 0.8581 0.8408 0.8584 0.8462
DecisionTreeClassifier 1.0000 0.6724 1.0000 0.7910 1.0000 0.7368
" + "
class 100 train 100 test 200 train 200 test 300 train 300 test
KNeighborsClassifier 0.7883 0.7727 0.8345 0.7971 0.8558 0.7681
SVC 0.8591 0.8333 0.8581 0.8408 0.8584 0.8462
DecisionTreeClassifier 1.0000 0.6325 1.0000 0.8029 1.0000 0.7121
" ], "text/plain": [ "" ] }, - "execution_count": 24, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -931,72 +931,141 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "What single model was chosen as the best. Which model is generally most appropriate based on the available data, limited resources, cost and performance.\n", - "\n", "The Support Vector Classifier was the best performing model. It was compared to K Nearest Neighbors and Decision Trees.\n", "\n", - "The resources, cost and performanc were not a major factor for the given data. This is due to the small amount of data in this example. Although all training and prediction times were very low, training was shortest for k-nearest neighbors because it does it's computation at prediction time, then decision trees, and the SVC was the slowest. Prediction time was shortest for decision trees, then the SVC and slowest for k-nearest neighbors.\n", - "\n", - "\n" + "The resources, cost and performanc were not a major factor for the given data. This is due to the small amount of data in this example. Although all training and prediction times were very low, training was shortest for k-nearest neighbors because it does it's computation at prediction time, then decision trees, and the SVC was the slowest. Prediction time was shortest for decision trees, then the SVC and slowest for k-nearest neighbors.\n" ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Support Vector classification works by linearly separating classes of data. A linear separator is chosen by finding the largest margin between classes. 
Data points that are closest to the margins on both sides of the separator are considered the support vectors.\n", + "\n", + "When a linear separation of data cannot be accomplished, SVM uses a kernel trick to map the data into a higher dimension. With a kernel trick applied, the data becomes linearly separable." + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# GridSearch tuning\n", + "\n", + "from sklearn import svm, grid_search, datasets\n", + "\n", + "parameters = {'kernel': ('linear', 'rbf'), 'C': [1.0, 10.0], 'gamma': ('auto', 0.01, 0.1, 0.5)}\n", + "\n", + "svr = svm.SVC()\n", + "clf = grid_search.GridSearchCV(svr, parameters)\n", + "clf.fit(X_train, y_train)\n", + "clf\n", + "clf.best_params_\n" + ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 66, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.84810126582278489" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# TODO: Fine-tune your model and report the best F1 score" + "from sklearn.metrics import f1_score\n", + "y_pred = clf.predict(X_test)\n", + "f1 = f1_score(y_test.values, y_pred, pos_label='yes')\n", + "f1" ] }, { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": true - }, - "outputs": [], + "cell_type": "markdown", + "metadata": {}, "source": [ - " # - tune: Kernal, Gamma, C parameter" + "### `GridSearch` results\n", + "\n", + "Using `GridSearch`, a slightly better F1 score of `0.8481` (up from `0.8462` for the untuned `SVC`) was achieved on the test data.\n", + "\n", + "Interestingly, my initial parameters to 
`GridSearch` provided a model that performed worse than the `SVM` model I used earlier. I'm guessing this is because `GridSearch` is using `cross-validation`, which fits on a subset of the training data, providing worse-than-ideal results. The worse-performing model is shown below." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "{'C': 1.5, 'gamma': 'auto', 'kernel': 'rbf'}" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Worse performing GridSearch tuning\n", + "parameters = {'kernel': ('linear', 'rbf'), 'C': [1.0, 1.5, 10.0], 'gamma': ('auto', 0.01, 0.1, 0.5)}\n", + "\n", + "svr = svm.SVC()\n", + "clf = grid_search.GridSearchCV(svr, parameters)\n", + "clf.fit(X_train, y_train)\n", + "clf\n", + "clf.best_params_\n" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "0.84563758389261745" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics import f1_score\n", + "y_pred = clf.predict(X_test)\n", + "f1 = f1_score(y_test.values, y_pred, pos_label='yes')\n", + "f1" + ] }, { "cell_type": "code",