Skip to content

Commit

Permalink
v 1.0.0.14
Browse files Browse the repository at this point in the history
* 简化了读写文件部分的代码;
* 修复了一些小的逻辑bug,简化代码;
* 选择80%的数据作为训练集,20%的数据作为测试集;
* 将一些参数抽离用配置文件配置
  • Loading branch information
Times125 committed Dec 21, 2017
1 parent a4869ec commit f4c0894
Show file tree
Hide file tree
Showing 15 changed files with 70 additions and 60 deletions.
7 changes: 6 additions & 1 deletion src_02/file_path_constant.py → src_02/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,9 @@
# Directory joined with '<dir_name>.xlsx' when the Excel workbooks are loaded.
mac_test_path = r'E:\Repositories\ML--Native-Bayes\test'
# mac_test_path = r'/Users/lch/Desktop/pycharm/Bayes/test'

# Names iterated as `dir_name` by the import code (one workbook per name).
dirs = ['culture', 'economy', 'energy', 'environment', 'political', 'security', 'technology']
# eng -- presumably the English-language category set (TODO confirm)
dirs = ['culture', 'economy', 'energy', 'environment', 'political', 'security', 'technology']
# Category labels; currently the same seven values as `dirs`.
categories = ['culture', 'economy', 'energy', 'environment', 'political', 'security', 'technology']

# tf-idf threshold (passed to get_doc_features when class features are built)
threshold = 0.02
31 changes: 7 additions & 24 deletions src_02/export_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,43 +11,26 @@
import time
import os
from text_processing import get_class_features
from file_path_constant import *
from config import *

__author__ = 'Lich'

categories = ['culture', 'economy', 'energy', 'environment', 'political', 'security', 'technology']

'''
'''
建立特征库
'''
def build_features_lib():
    """Export every category's feature word list to the feature library.

    Fetches the ``(words, category)`` pairs from ``get_class_features()`` and
    hands each one to ``write_to_file``; progress and the elapsed time are
    printed to stdout.

    Returns:
        0, always.
    """
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement, so the output is unchanged.
    print(u'正在导出特征,请等待...')
    start_time = time.time()
    # Self-built corpus: [(lst1, cat1), (lst2, cat2), ..., (lst7, cat7)]
    l_features = get_class_features()
    for item in l_features:
        # Every category is exported the same way, so no per-category
        # dispatch is needed and each item is written exactly once.
        write_to_file(item)
    end_time = time.time()
    print(u'特征库导出完成! 一共耗时%.4f秒' % (end_time - start_time))
    return 0

def write_to_file(item):
    """Write one category's feature words to ``<mac_f_path>/<category>.txt``.

    Args:
        item: ``(words, category)`` pair. ``words`` is a list of strings that
            is joined with single spaces; ``category`` names the output file.
    """
    # Create the output directory on first use, then always write the file.
    # Writing only in an ``else`` branch would skip the export on the very
    # run that had to create the directory.
    if not os.path.exists(mac_f_path):
        os.makedirs(mac_f_path)
    file_name = os.path.join(mac_f_path, item[1] + r'.txt')
    with codecs.open(file_name, 'wb', 'utf-8') as writer:
        writer.write(' '.join(item[0]))  # list -> str
8 changes: 3 additions & 5 deletions src_02/import_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"""
from export_data import build_features_lib
from openpyxl import load_workbook
from file_path_constant import *
from config import *
import codecs
import re
import time
Expand All @@ -25,11 +25,9 @@ def import_data_from_excel():
start_time = time.time()
log_info = {}
for dir_name in dirs:
path = os.path.join(mac_test_path, dir_name)
wb = load_workbook(os.path.join(path, dir_name + r'.xlsx'))
wb = load_workbook(os.path.join(mac_test_path, dir_name + r'.xlsx'))
print wb.sheetnames
sheet = wb.get_sheet_by_name("sheet1")

tmp_path = os.path.join(mac_path, dir_name)
a = 0
for row in sheet['A']:
Expand Down Expand Up @@ -62,7 +60,7 @@ def import_features_from_lib():
txt = reader.read().decode('ISO-8859-15').encode('utf-8')
txt = re.sub(r'[^\x00-\x7F]+', '', txt) # 去除所有非ASCII字符
lst = txt.split(' ')
print dir_name, "特征包含共%d个词" % len(lst)
print dir_name, u"特征包含共%d个词" % len(lst)
all_features_words = all_features_words | set(lst)
features.append((lst, dir_name)) # [(lst1,cat1),(lst2,cat2),...,(lst7,cat7)]
return features, all_features_words
27 changes: 16 additions & 11 deletions src_02/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import sys
import os
import codecs
from file_path_constant import *
from config import *
from nltk_bayes_classifier import import_features_from_lib, get_model
from nltk_bayes_classifier import import_data_from_excel, train_native_bayes_classifier
from export_data import build_features_lib
Expand All @@ -24,7 +24,7 @@

def main():
try:
opts, args = getopt.getopt(sys.argv[1:], 'heltc:', ['classify=', 'help', 'excel', 'train', 'lib'])
opts, args = getopt.getopt(sys.argv[1:], 'heltac:', ['classify=', 'help', 'excel', 'train', 'lib', 'auto'])
except getopt.GetoptError:
sys.exit(-1)
for opt, value in opts:
Expand All @@ -39,8 +39,13 @@ def main():
if opt in ('-t', '--train'):
train()
sys.exit('train successful')
if opt in ('-a', '--auto'):
import_data_from_excel()
build_features_lib()
train()
sys.exit('auto successfully')
if opt in ('-c', '--classify'):
res = classify_text(' '.join(args).decode('utf-8'))
res = classify_text(' '.join(args))
sys.exit(res)


Expand Down Expand Up @@ -82,7 +87,7 @@ def train():

print queue_pool.empty()
while not queue_pool.empty():
res = queue_pool.get(True) # (p_post_list, dir_name, p_vocab_set)
res = queue_pool.get(True) # (p_post_list, p_vocab_set)
for lst in res[0]:
post_list.append(lst)
vocab_set = vocab_set | res[1]
Expand Down Expand Up @@ -174,12 +179,12 @@ def tests():


if __name__ == '__main__':
# Script entry point.
# NOTE(review): this span comes from a rendered diff, so removed and added
# lines appear together without +/- markers; confirm which statements are live.
# main() # run the program
program_start_time = time.time()
#program_start_time = time.time()
check_dirs()
import_data_from_excel()
build_features_lib()
train()
# NOTE(review): the comma makes the elapsed time a second print item, so the
# '%.4f' placeholder is printed literally instead of being filled in; the
# commented-out variant further down uses the % operator.
print u'全套程序运行共花费时间%.4f', (time.time() - program_start_time)
main() # run the program
# import_data_from_excel()
# build_features_lib()
# train()
#print u'全套程序运行共花费时间%.4f'%(time.time() - program_start_time)
# tec -- presumably marks the technology-category sample text passed to classify_text below (TODO confirm)
classify_text(u'Google plans to stop Amazon\'s Fire TV streaming devices being able to use YouTube from the start of 2018. The search giant has also blocked a workaround that Amazon introduced to restore YouTube access to a screen-based version of its smart speaker. Experts say the steps mark an escalation of a business row in which consumers have been caught up in the fallout. Amazon had previously stopped selling several of Google\'s hardware products. It removed the latest Nest-branded smart home kit - including a home security system and a new version of its thermostat - from its online stores last month. And since 2015, Amazon has refused to sell Google\'s Chromecast video and audio-streaming dongles. The latest development coincides with the release of Amazon\'s Prime Video app for the Apple TV. Its absence had previously put Apple\'s set-top box at a disadvantage to Amazon\'s Fire TV line-up. Fire TV owners have reported that trying to watch YouTube clips now prompts an alert warning them that they will lose the functionality on 1 January. I use firestick to watch YouTube primarily and suddenly this message appears today. No #youtube on #FireTV from 1/1/18. Great! pic.twitter.com/Pe53chi4ft End of Twitter post by @eqbalashraf "We\'ve been trying to reach agreement with Amazon to give consumers access to each other\'s products and services," Google said in a statement. "But Amazon doesn\'t carry Google products like Chromecast and Google Home, doesn\'t make Prime Video available for Google Cast users, and last month stopped selling some of Nest\'s latest products. "Given this lack of reciprocity, we are no longer supporting YouTube on Echo Show and FireTV. We hope we can reach an agreement to resolve these issues soon." Google had stopped Amazon\'s Echo Show speakers being able to play YouTube videos in September, on the basis that the retailer had altered the way the software worked. 
The version Amazon presented had lacked next video recommendations, subscriptions and other features - but these were restored in November, when Amazon made the device present a more normal view of YouTube. But, according to Techcrunch, the search firm believes its rights have still been violated because Amazon continues to overlay its own voice controls. Amazon has responded, saying: "Echo Show and Fire TV now display a standard web view of YouTube.com and point customers directly to YouTube\'s existing website. Google is setting a disappointing precedent by selectively blocking customer access to an open website. We hope to resolve this with Google as soon as possible." The dispute disadvantages consumers in two ways. Users will be unable to access a service that Amazon\'s devices had promised to deliver. And Amazon\'s refusal to even allow third-parties to sell certain Google products via its site makes it harder to find them at their lowest price. "It\'s a surprising turn of events in both respects," commented Ben Wood from the CCS Insight tech consultancy. "YouTube is all about maximising the number of people who see its content, and Amazon wants to be the so-called \'everything store\'It\'s all very unfortunate for consumers, who will have little understanding of the commercial tensions between the two companies. "I wonder whether the next step might be the intervention of a regulator to investigate whether they are behaving anti-competitively.')
# classify_text(u'Google plans to stop Amazon\'s Fire TV streaming devices being able to use YouTube from the start of 2018. The search giant has also blocked a workaround that Amazon introduced to restore YouTube access to a screen-based version of its smart speaker. Experts say the steps mark an escalation of a business row in which consumers have been caught up in the fallout. Amazon had previously stopped selling several of Google\'s hardware products. It removed the latest Nest-branded smart home kit - including a home security system and a new version of its thermostat - from its online stores last month. And since 2015, Amazon has refused to sell Google\'s Chromecast video and audio-streaming dongles. The latest development coincides with the release of Amazon\'s Prime Video app for the Apple TV. Its absence had previously put Apple\'s set-top box at a disadvantage to Amazon\'s Fire TV line-up. Fire TV owners have reported that trying to watch YouTube clips now prompts an alert warning them that they will lose the functionality on 1 January. I use firestick to watch YouTube primarily and suddenly this message appears today. No #youtube on #FireTV from 1/1/18. Great! pic.twitter.com/Pe53chi4ft End of Twitter post by @eqbalashraf "We\'ve been trying to reach agreement with Amazon to give consumers access to each other\'s products and services," Google said in a statement. "But Amazon doesn\'t carry Google products like Chromecast and Google Home, doesn\'t make Prime Video available for Google Cast users, and last month stopped selling some of Nest\'s latest products. "Given this lack of reciprocity, we are no longer supporting YouTube on Echo Show and FireTV. We hope we can reach an agreement to resolve these issues soon." Google had stopped Amazon\'s Echo Show speakers being able to play YouTube videos in September, on the basis that the retailer had altered the way the software worked. 
The version Amazon presented had lacked next video recommendations, subscriptions and other features - but these were restored in November, when Amazon made the device present a more normal view of YouTube. But, according to Techcrunch, the search firm believes its rights have still been violated because Amazon continues to overlay its own voice controls. Amazon has responded, saying: "Echo Show and Fire TV now display a standard web view of YouTube.com and point customers directly to YouTube\'s existing website. Google is setting a disappointing precedent by selectively blocking customer access to an open website. We hope to resolve this with Google as soon as possible." The dispute disadvantages consumers in two ways. Users will be unable to access a service that Amazon\'s devices had promised to deliver. And Amazon\'s refusal to even allow third-parties to sell certain Google products via its site makes it harder to find them at their lowest price. "It\'s a surprising turn of events in both respects," commented Ben Wood from the CCS Insight tech consultancy. "YouTube is all about maximising the number of people who see its content, and Amazon wants to be the so-called \'everything store\'It\'s all very unfortunate for consumers, who will have little understanding of the commercial tensions between the two companies. "I wonder whether the next step might be the intervention of a regulator to investigate whether they are behaving anti-competitively.')
34 changes: 27 additions & 7 deletions src_02/nltk_bayes_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,34 @@
'''
朴素贝叶斯分类器
'''
categories = ['culture', 'economy', 'energy', 'environment', 'political', 'security', 'technology']
train_set = []
test_set = []


def train_native_bayes_classifier(m_features, post_list, vocab_set=None):
# Train an NLTK Naive Bayes classifier on a shuffled 80/20 split of post_list,
# print per-sample results and the test accuracy, and pickle the classifier to
# <mac_f_path>/my_classifier_pickle.
#
# m_features: per-category feature lists; stored in the global `word_features`,
#             which doc_features() reads.
# post_list:  list of (document words, category) pairs; shuffled IN PLACE.
# vocab_set:  not used anywhere in the visible body.
#
# NOTE(review): this span comes from a rendered diff, so removed and added
# lines appear together without +/- markers; confirm which statements are live.
global word_features
word_features = m_features # [(lst1,cat1),(lst2,cat2),...,(lst7,cat7)]
random.shuffle(post_list) # shuffle the samples
lst_sum = len(post_list)
# NOTE(review): round() returns a float on Python 2, so this first `pre` is not
# a valid slice index; the int(...) variant two lines below supersedes it.
pre = round(lst_sum * 0.8) # first 80% of the data is the training set, last 20% the test set
train_set = post_list[:pre] # [('words in document','category'),('words in document','category')]
pre = int(round(lst_sum * 0.8)) # first 80% of the data is the training set, last 20% the test set
train_set = post_list[:pre] # [('words in document','category'),..,('words in document','category')]
test_set = post_list[pre:]

# NOTE(review): overwrites the labels of the first 150 test samples with a
# random category, and indexes test_set[0..149] without a length check.
# Looks like an experiment -- confirm it is intended before relying on the
# reported accuracy.
for i in range(150):
test_set[i] = (test_set[i][0], categories[random.randint(0, 6)])

# Convert each (document, category) pair into (feature dict, category).
train_data = [(doc_features(doc, category), category) for (doc, category) in train_set]
test_data = [(doc_features(doc, category), category) for (doc, category) in test_set]
classifier = nltk.classify.NaiveBayesClassifier.train(train_data)

print len(train_data), '--', len(test_data)
print len(train_set), '--', len(test_set)
# Print, for every test sample, its label and whether the prediction matched.
for it in test_data:
res = classifier.classify(it[0])
print it[1], '< --------------- >', it[1] == res
print 'test_accuracy is %.7f' % nltk.classify.accuracy(classifier, test_data)
# Persist the trained classifier.
f = open(os.path.join(mac_f_path, 'my_classifier_pickle'), 'wb')
pickle.dump(classifier, f)
f.close()
print 'test_accuracy is %.4f' % nltk.classify.accuracy(classifier, test_data)

'''
获取保存的模型
Expand All @@ -49,30 +57,42 @@ def get_model():
'''
构建一个字典,主要表示输入文档的单词,是否出现在自己构建的语料库中
'''


def doc_features(doc, category):
    """Build the ``contains(word)`` feature dict for one document.

    For every entry of the module-level ``word_features`` list whose category
    equals ``category``, each distinct feature word becomes a key
    ``'contains(<word>)'`` whose value tells whether the word occurs in
    ``doc``.

    Args:
        doc: iterable of the words that make up the document.
        category: category name used to select the feature word lists.

    Returns:
        dict mapping ``'contains(<word>)'`` to ``True``/``False``; empty when
        no entry of ``word_features`` matches ``category``.
    """
    # NOTE(review): the feature vocabulary is selected by `category`, so the
    # caller must already know (or guess) the document's label.
    doc_words = set(doc)
    d_features = {}
    for wf in word_features:  # wf is (word list, category)
        if wf[1] != category:
            continue
        for word in set(wf[0]):  # de-duplicate before building the keys
            d_features['contains(%s)' % word] = word in doc_words
    return d_features
23 changes: 11 additions & 12 deletions src_02/text_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import os
from collections import Counter
from threading import Thread
from file_path_constant import *
from config import *
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords as stopwords
from nltk.stem import WordNetLemmatizer
Expand Down Expand Up @@ -47,13 +47,12 @@ def text_parse(input_text, language='eng'):
| (?:[,.;'"?():-_`])"""

tag_list = set(['TO', 'RB', 'RBR', 'RBRS', 'UH', 'WDT', 'WP', 'WP$', 'WRB', 'SYM', 'RP', 'PRP', 'PRP$', 'CD', 'POS',':'])
word_list =regexp_tokenize(sentence, pattern)
word_list = regexp_tokenize(sentence, pattern)
if language is 'eng':
filter_word = [w for w in word_list if w not in stopwords.words('english') and w not in special_tag] # 去停用词和特殊标点符号
else:
filter_word = [w for w in word_list if w not in stopwords.words('french') and w not in special_tag] # 去停用词和特殊标点符号
word_tag = pos_tag(filter_word, tagset=None, lang=language) # 词性标注,返回标记列表[('Codeine', 'NNP'), ('15mg', 'CD')]
print word_tag
word_tag = pos_tag(filter_word, tagset=None, lang=language) # 词性标注,返回标记列表[('Codeine', 'NNP'), ('15mg', 'CD')
res_word_list = []
for i in range(0, len(word_tag)): # 去掉副词、介词、小品词、疑问词、代词、人称代词、所有格代名词等
if word_tag[i][1] in tag_list:
Expand Down Expand Up @@ -213,7 +212,7 @@ def get_class_features():
t_time_end = time.time()
print u'进程耗时%.4f 秒' % (t_time_end - t_time) # 453秒
print u'文本去除停用词、词形还原后还剩余', len(list(total_vocab_set)), u'个不重复单词。'
docs_features = get_doc_features(post_list, total_vocab_set, 0.01) # [[],[],..,[]]
docs_features = get_doc_features(post_list, total_vocab_set, threshold) # [[],[],..,[]]
for i in range(0, len(m_categories)):
if m_categories[i] == 'culture':
cul.extend(docs_features[i])
Expand All @@ -229,13 +228,13 @@ def get_class_features():
sec.extend(docs_features[i])
elif m_categories[i] == 'technology':
tec.extend(docs_features[i])
features.append((env, 'environment'))
features.append((eco, 'economy'))
features.append((pol, 'political'))
features.append((cul, 'culture'))
features.append((sec, 'security'))
features.append((tec, 'technology'))
features.append((ene, 'energy'))
features.append((list(set(env)), 'environment'))
features.append((list(set(eco)), 'economy'))
features.append((list(set(pol)), 'political'))
features.append((list(set(cul)), 'culture'))
features.append((list(set(sec)), 'security'))
features.append((list(set(tec)), 'technology'))
features.append((list(set(ene)), 'energy'))

end_time = time.time()
print 'method get_class_features() cost total time %0.4f seconds' % (end_time - start_time) # 530秒
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file added test/technology.xlsx
Binary file not shown.
Binary file removed test/technology/technology.xlsx
Binary file not shown.

0 comments on commit f4c0894

Please sign in to comment.