CyberBullyingDetection-SVM&NB.py
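"""
Cyberbullying detection demo: trains two text classifiers and compares their
predictions on a sample tweet.

  * SVM (linear kernel) on TF-IDF features, trained on labelled train/test CSVs
  * NLTK Naive Bayes classifier, trained on the twitter_samples sentiment corpus

A tweet is flagged as cyberbullying only when both classifiers predict "Negative".
"""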
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import classify, NaiveBayesClassifier
import re, string, random
#NB- remove noise function: clean one tokenized tweet
def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []
    lemmatizer = WordNetLemmatizer()
    #remove noise using regular expressions - strip http(s) links and @user mentions
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)
        #map the Penn Treebank POS tag to the WordNet tag expected by the lemmatizer
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)
        #keep the lowercased token if it is non-empty, not punctuation and not a stop word
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
#NB- get all words function: flattens the list of token lists into one stream of words
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token
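#NB- convert each cleaned tweet into {token: True, ...}, the bag-of-words presence
#format that NLTK's NaiveBayesClassifier expects as features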
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)
if __name__ == "__main__":
    #datasets used
    #SVM datasets: labelled CSVs with 'Content' and 'Label' columns
    trainData = pd.read_csv("C:\\Users\\Snigdhabose\\Desktop\\major proj\\code\\randforeg1\\train.csv")
    testData = pd.read_csv("C:\\Users\\Snigdhabose\\Desktop\\major proj\\code\\randforeg1\\test.csv")
    #NB datasets: NLTK twitter_samples corpus
    #5000 positive, 5000 negative and 20000 unlabelled tweets
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
print("-------------SVM Classifier-----------------\n")
#SVM- Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
max_df = 0.8,
sublinear_tf = True,
use_idf = True)
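    # min_df=5 ignores terms that appear in fewer than 5 training documents, max_df=0.8 ignores
    # terms that appear in more than 80% of documents, and sublinear_tf uses 1 + log(tf) in place
    # of the raw term frequency.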
    train_vectors = vectorizer.fit_transform(trainData['Content'])
    test_vectors = vectorizer.transform(testData['Content'])
    #SVM- Perform classification with SVM, kernel=linear
    classifier_linear = svm.SVC(kernel='linear')
    t0 = time.time()
    classifier_linear.fit(train_vectors, trainData['Label'])
    t1 = time.time()
    prediction_linear = classifier_linear.predict(test_vectors)
    t2 = time.time()
    time_linear_train = t1 - t0
    time_linear_predict = t2 - t1
    #SVM- Classifier Report
    print("Sample Train Data for SVM Classifier:")
    print(trainData.sample(frac=1).head(5))
    print("\nSVM Classifier Report:-")
    print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
    report = classification_report(testData['Label'], prediction_linear, output_dict=True)
    print("*NOTE: F1 = 2 * (precision * recall) / (precision + recall)")
    print('positive: ', report['pos'])
    print('negative: ', report['neg'])
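    # NOTE: the 'pos'/'neg' keys above assume the CSV's Label column uses exactly those class
    # names; adjust the keys if the dataset is labelled differently.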
print("\n-------------NB Classifier-----------------")
#NB- tokenize, stop words
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
#NB-pass tokenized n stop words to remove noise function
for tokens in positive_tweet_tokens:
positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    train_data = dataset[:7000]
    test_data = dataset[7000:]
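    # twitter_samples ships 5000 positive and 5000 negative tweets, so the shuffled dataset of
    # 10000 examples is split into 7000 training and 3000 test examples.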
    #NB- train the classifier
    classifier = NaiveBayesClassifier.train(train_data)
    #NB- classifier report
    print("Naive Bayes Classifier Report:-")
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    print("**MOST COMMON INFORMATIVE FEATURES ARE:-**")
    classifier.show_most_informative_features(10)
    #Both classifiers are ready for testing with a new data sample
    print("\n-------------Testing the models-----------------")
    #test1
    print("SAMPLE TEXT1:")
    custom_tweet = "@shivangi234 is so fat. hahaha. She didn't even deserve the title of Miss glam2020. She is so ugly and fat too. All these shows are a scam."
    print(custom_tweet)
    #NB prediction on the cleaned, tokenized tweet
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    nbresult = classifier.classify(dict([token, True] for token in custom_tokens))
    print("NB Classifier Result:", nbresult)
    #SVM prediction on the TF-IDF vector of the raw tweet
    review_vector = vectorizer.transform([custom_tweet])  # vectorizing
    if classifier_linear.predict(review_vector)[0] == 'neg':
        svmresult = "Negative"
    else:
        svmresult = "Positive"
    print("SVM Classifier Result:", svmresult)
    #Compare the results of both classifiers
    print("**RESULT**")
    if svmresult == nbresult:
        if svmresult == 'Negative':
            print("CyberBullying is Detected (using SVM Classifier and Naive Bayes Classifier)")
        else:
            print("CyberBullying is not Detected (using SVM Classifier and Naive Bayes Classifier)")
    else:
        print("Unable to Detect")