-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Your Name
committed
Sep 20, 2018
1 parent
b87a9cc
commit ba13c18
Showing
13 changed files
with
851 additions
and
36,744 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,369 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from lib import *\n", | ||
"from features import *" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Importing data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = pd.read_csv('./train_data.csv')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = data.sample(frac=0.4).reset_index(drop=True)\n", | ||
"size = data.shape\n", | ||
"size" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Data Visualisation\n", | ||
"Showing the distribution of the data over the four categories of headlines:\n", | ||
"\n", | ||
"- m: Medical\n", | ||
"- e: Entertainment\n", | ||
"- b: Business\n", | ||
"- t: Tech" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"plt.hist(data.CATEGORY.factorize()[0])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Data samples" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"for line,i in zip(data['TITLE'],range(data['TITLE'].shape[0])):\n", | ||
" data.loc[i,('TITLE')] = normalise_text(line)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"cv_matrix, cv = countVectorizer(data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"tv_matrix, tv = tfidfTransformer(cv_matrix)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"categories = data.CATEGORY.factorize()[0]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"training_data, testing_data, training_op, test_op = split_data(tv_matrix,categories)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"svm = SVC()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 40, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"rfc_model = RandomForestClassifier(min_samples_split=4,criterion='entropy',random_state=10)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 41, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n", | ||
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n", | ||
" min_impurity_decrease=0.0, min_impurity_split=None,\n", | ||
" min_samples_leaf=1, min_samples_split=4,\n", | ||
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", | ||
" oob_score=False, random_state=10, verbose=0, warm_start=False)" | ||
] | ||
}, | ||
"execution_count": 41, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"rfc_model.fit(training_data,training_op)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 42, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"0.9906786808113986" | ||
] | ||
}, | ||
"execution_count": 42, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"rfc_model.score(training_data,training_op)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 43, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"0.8857193584659999" | ||
] | ||
}, | ||
"execution_count": 43, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"rfc_model.score(testing_data,test_op)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 53, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#url = \"http://www.bbc.com/\"\n", | ||
"url = \"https://in.yahoo.com/?p=us\"\n", | ||
"headlines = extract_hedlines(url)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 54, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"for lines, i in zip(headlines, range(len(headlines))):\n", | ||
" headlines[i] = normalise_text(lines)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 55, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"testing_headlines = pd.DataFrame({\"TITLE\":headlines})" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 56, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"testing_cv_matrix = cv.transform(testing_headlines['TITLE'])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 57, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"testing_cv_matrix = testing_cv_matrix.toarray()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 58, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"transformed_testing_data = tv.transform(testing_cv_matrix) " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 59, | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"prediction = rfc_model.predict(transformed_testing_data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 60, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"opposition parties like dmk ncp rjd , jd ( ) , extended support bandh . t\n", | ||
"weekly horoscope 10th sep 16th sep 2018 ) analysis provide insights crucial planetary movements impact life . stay tuned astroyogi ’ expert astrologers horoscope analysis . single , marriage proposals expected chances falling love first sight . take care health avoid smoking drinking alcohol . m\n", | ||
"heartbreaking news chelsi smith first texan win miss universe crown , passed away year-long battle liver cancer age 45. former beauty queen bollywood actor sushmita sen , e\n", | ||
"sanghavis car blue-coloured maruti ignis , found police near sector 11 airoli , navi mumbai , thursday . police officer said blood stains knife found rear seat car . b\n", | ||
"shahid kapoor mira rajput became parents second time baby boy . e\n", | ||
"paro derogatory term used women trafficked sold brides men haryana e\n", | ||
"apache rtr 160 4v carburettor model got covered comes racing heritage tvs motor read detailed review new tvs apache rtr 160 4v e\n", | ||
"aishwarya rai bachchan gets emotional listening national anthem event video aishwarya rai bachchan making rounds social media work front , aishwarya next seen husband abhishek bachchan gulab jamun . check video ! # aishwaryaraibachchan # aishwaryaemotional # nationalanthem e\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"for i in range(testing_headlines['TITLE'].shape[0]):\n", | ||
" if(len(testing_headlines.TITLE[i])>20):\n", | ||
" print(testing_headlines.TITLE[i],\" \",data.CATEGORY.factorize()[1][prediction[i]])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Binary file not shown.
Binary file not shown.
Oops, something went wrong.