v 1.0.0.14

* 简化了读写文件部分的代码；
* 修复了一些小的逻辑 bug，简化代码；
* 选择 80% 的数据作为训练集，20% 的数据作为测试集；
* 将一些参数抽离，用配置文件配置。
lichanging · Dec 21, 2017 · f4c0894
1 parent a4869ec
commit f4c0894
Show file tree

Hide file tree

Showing 15 changed files with 70 additions and 60 deletions.
diff --git a/src_02/file_path_constant.py b/src_02/config.py
@@ -15,4 +15,9 @@
 mac_test_path = r'E:\Repositories\ML--Native-Bayes\test'
 # mac_test_path = r'/Users/lch/Desktop/pycharm/Bayes/test'
 
-dirs = ['culture', 'economy', 'energy', 'environment', 'political', 'security', 'technology']
+# eng
+dirs = ['culture', 'economy', 'energy', 'environment', 'political', 'security', 'technology']
+categories = ['culture', 'economy', 'energy', 'environment', 'political', 'security', 'technology']
+
+# tf-idf阈值
+threshold = 0.02
diff --git a/src_02/export_data.py b/src_02/export_data.py
@@ -11,43 +11,26 @@
 import time
 import os
 from text_processing import get_class_features
-from file_path_constant import *
+from config import *
 
 __author__ = 'Lich'
 
-categories = ['culture', 'economy', 'energy', 'environment', 'political', 'security', 'technology']
 
-'''
+'''  
 建立特征库
 '''
 def build_features_lib():
     print u'正在导出特征，请等待...'
     start_time = time.time()
     l_features = get_class_features()  # 这是自己建立的语料库 [(lst1,cat1),(lst2,cat2),...,(lst7,cat7)]
     for item in l_features:
-        if item[1] == categories[0]:
-            write_to_file(item)
-        elif item[1] == categories[1]:
-            write_to_file(item)
-        elif item[1] == categories[2]:
-            write_to_file(item)
-        elif item[1] == categories[3]:
-            write_to_file(item)
-        elif item[1] == categories[4]:
-            write_to_file(item)
-        elif item[1] == categories[5]:
-            write_to_file(item)
-        elif item[1] == categories[6]:
-            write_to_file(item)
+        write_to_file(item)
     end_time = time.time()
     print u'特征库导出完成！ 一共耗时%.4f秒' % (end_time - start_time)
     return 0
 
 def write_to_file(item):
-    if not os.path.exists(mac_f_path):
-        os.makedirs(mac_f_path)
-    else:
-        file_name = os.path.join(mac_f_path, item[1]) + r'.txt'
-        with codecs.open(file_name, 'wb', 'utf-8') as writer:
-            txt = ' '.join(item[0])  # list 转 str
-            writer.write(txt)
+    file_name = os.path.join(mac_f_path, item[1] + r'.txt')
+    with codecs.open(file_name, 'wb', 'utf-8') as writer:
+        txt = ' '.join(item[0])  # list 转 str
+        writer.write(txt)
diff --git a/src_02/import_data.py b/src_02/import_data.py
@@ -8,7 +8,7 @@
 """
 from export_data import build_features_lib
 from openpyxl import load_workbook
-from file_path_constant import *
+from config import *
 import codecs
 import re
 import time
@@ -25,11 +25,9 @@ def import_data_from_excel():
     start_time = time.time()
     log_info = {}
     for dir_name in dirs:
-        path = os.path.join(mac_test_path, dir_name)
-        wb = load_workbook(os.path.join(path, dir_name + r'.xlsx'))
+        wb = load_workbook(os.path.join(mac_test_path, dir_name + r'.xlsx'))
         print wb.sheetnames
         sheet = wb.get_sheet_by_name("sheet1")
-
         tmp_path = os.path.join(mac_path, dir_name)
         a = 0
         for row in sheet['A']:
@@ -62,7 +60,7 @@ def import_features_from_lib():
             txt = reader.read().decode('ISO-8859-15').encode('utf-8')
             txt = re.sub(r'[^\x00-\x7F]+', '', txt)  # 去除所有非ASCII字符
             lst = txt.split(' ')
-            print dir_name, "特征包含共%d个词" % len(lst)
+            print dir_name, u"特征包含共%d个词" % len(lst)
             all_features_words = all_features_words | set(lst)
             features.append((lst, dir_name))  # [(lst1,cat1),(lst2,cat2),...,(lst7,cat7)]
     return features, all_features_words
diff --git a/src_02/main.py b/src_02/main.py
@@ -11,7 +11,7 @@
 import sys
 import os
 import codecs
-from file_path_constant import *
+from config import *
 from nltk_bayes_classifier import import_features_from_lib, get_model
 from nltk_bayes_classifier import import_data_from_excel, train_native_bayes_classifier
 from export_data import build_features_lib
@@ -24,7 +24,7 @@
 
 def main():
     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'heltc:', ['classify=', 'help', 'excel', 'train', 'lib'])
+        opts, args = getopt.getopt(sys.argv[1:], 'heltac:', ['classify=', 'help', 'excel', 'train', 'lib', 'auto'])
     except getopt.GetoptError:
         sys.exit(-1)
     for opt, value in opts:
@@ -39,8 +39,13 @@ def main():
         if opt in ('-t', '--train'):
             train()
             sys.exit('train successful')
+        if opt in ('-a', '--auto'):
+            import_data_from_excel()
+            build_features_lib()
+            train()
+            sys.exit('auto successfully')
         if opt in ('-c', '--classify'):
-            res = classify_text(' '.join(args).decode('utf-8'))
+            res = classify_text(' '.join(args))
             sys.exit(res)
 
 
@@ -82,7 +87,7 @@ def train():
 
     print queue_pool.empty()
     while not queue_pool.empty():
-        res = queue_pool.get(True)  # (p_post_list, dir_name, p_vocab_set)
+        res = queue_pool.get(True)  # (p_post_list, p_vocab_set)
         for lst in res[0]:
             post_list.append(lst)
         vocab_set = vocab_set | res[1]
@@ -174,12 +179,12 @@ def tests():
 
 
 if __name__ == '__main__':
-    # main()  # 运行程序
-    program_start_time = time.time()
+    #program_start_time = time.time()
     check_dirs()
-    import_data_from_excel()
-    build_features_lib()
-    train()
-    print u'全套程序运行共花费时间%.4f', (time.time() - program_start_time)
+    main()  # 运行程序
+    # import_data_from_excel()
+    # build_features_lib()
+    # train()
+    #print u'全套程序运行共花费时间%.4f'%(time.time() - program_start_time)
     # tec
-    classify_text(u'Google plans to stop Amazon\'s Fire TV streaming devices being able to use YouTube from the start of 2018. The search giant has also blocked a workaround that Amazon introduced to restore YouTube access to a screen-based version of its smart speaker. Experts say the steps mark an escalation of a business row in which consumers have been caught up in the fallout. Amazon had previously stopped selling several of Google\'s hardware products. It removed the latest Nest-branded smart home kit - including a home security system and a new version of its thermostat - from its online stores last month. And since 2015, Amazon has refused to sell Google\'s Chromecast video and audio-streaming dongles. The latest development coincides with the release of Amazon\'s Prime Video app for the Apple TV. Its absence had previously put Apple\'s set-top box at a disadvantage to Amazon\'s Fire TV line-up. Fire TV owners have reported that trying to watch YouTube clips now prompts an alert warning them that they will lose the functionality on 1 January. I use firestick to watch YouTube primarily and suddenly this message appears today. No #youtube on #FireTV from 1/1/18. Great! pic.twitter.com/Pe53chi4ft End of Twitter post by @eqbalashraf "We\'ve been trying to reach agreement with Amazon to give consumers access to each other\'s products and services," Google said in a statement. "But Amazon doesn\'t carry Google products like Chromecast and Google Home, doesn\'t make Prime Video available for Google Cast users, and last month stopped selling some of Nest\'s latest products. "Given this lack of reciprocity, we are no longer supporting YouTube on Echo Show and FireTV. We hope we can reach an agreement to resolve these issues soon." Google had stopped Amazon\'s Echo Show speakers being able to play YouTube videos in September, on the basis that the retailer had altered the way the software worked. 
The version Amazon presented had lacked next video recommendations, subscriptions and other features - but these were restored in November, when Amazon made the device present a more normal view of YouTube. But, according to Techcrunch, the search firm believes its rights have still been violated because Amazon continues to overlay its own voice controls. Amazon has responded, saying: "Echo Show and Fire TV now display a standard web view of YouTube.com and point customers directly to YouTube\'s existing website. Google is setting a disappointing precedent by selectively blocking customer access to an open website. We hope to resolve this with Google as soon as possible." The dispute disadvantages consumers in two ways. Users will be unable to access a service that Amazon\'s devices had promised to deliver. And Amazon\'s refusal to even allow third-parties to sell certain Google products via its site makes it harder to find them at their lowest price. "It\'s a surprising turn of events in both respects," commented Ben Wood from the CCS Insight tech consultancy. "YouTube is all about maximising the number of people who see its content, and Amazon wants to be the so-called \'everything store\'It\'s all very unfortunate for consumers, who will have little understanding of the commercial tensions between the two companies. "I wonder whether the next step might be the intervention of a regulator to investigate whether they are behaving anti-competitively.')
+    # classify_text(u'Google plans to stop Amazon\'s Fire TV streaming devices being able to use YouTube from the start of 2018. The search giant has also blocked a workaround that Amazon introduced to restore YouTube access to a screen-based version of its smart speaker. Experts say the steps mark an escalation of a business row in which consumers have been caught up in the fallout. Amazon had previously stopped selling several of Google\'s hardware products. It removed the latest Nest-branded smart home kit - including a home security system and a new version of its thermostat - from its online stores last month. And since 2015, Amazon has refused to sell Google\'s Chromecast video and audio-streaming dongles. The latest development coincides with the release of Amazon\'s Prime Video app for the Apple TV. Its absence had previously put Apple\'s set-top box at a disadvantage to Amazon\'s Fire TV line-up. Fire TV owners have reported that trying to watch YouTube clips now prompts an alert warning them that they will lose the functionality on 1 January. I use firestick to watch YouTube primarily and suddenly this message appears today. No #youtube on #FireTV from 1/1/18. Great! pic.twitter.com/Pe53chi4ft End of Twitter post by @eqbalashraf "We\'ve been trying to reach agreement with Amazon to give consumers access to each other\'s products and services," Google said in a statement. "But Amazon doesn\'t carry Google products like Chromecast and Google Home, doesn\'t make Prime Video available for Google Cast users, and last month stopped selling some of Nest\'s latest products. "Given this lack of reciprocity, we are no longer supporting YouTube on Echo Show and FireTV. We hope we can reach an agreement to resolve these issues soon." Google had stopped Amazon\'s Echo Show speakers being able to play YouTube videos in September, on the basis that the retailer had altered the way the software worked. 
The version Amazon presented had lacked next video recommendations, subscriptions and other features - but these were restored in November, when Amazon made the device present a more normal view of YouTube. But, according to Techcrunch, the search firm believes its rights have still been violated because Amazon continues to overlay its own voice controls. Amazon has responded, saying: "Echo Show and Fire TV now display a standard web view of YouTube.com and point customers directly to YouTube\'s existing website. Google is setting a disappointing precedent by selectively blocking customer access to an open website. We hope to resolve this with Google as soon as possible." The dispute disadvantages consumers in two ways. Users will be unable to access a service that Amazon\'s devices had promised to deliver. And Amazon\'s refusal to even allow third-parties to sell certain Google products via its site makes it harder to find them at their lowest price. "It\'s a surprising turn of events in both respects," commented Ben Wood from the CCS Insight tech consultancy. "YouTube is all about maximising the number of people who see its content, and Amazon wants to be the so-called \'everything store\'It\'s all very unfortunate for consumers, who will have little understanding of the commercial tensions between the two companies. "I wonder whether the next step might be the intervention of a regulator to investigate whether they are behaving anti-competitively.')
diff --git a/src_02/nltk_bayes_classifier.py b/src_02/nltk_bayes_classifier.py
@@ -15,26 +15,34 @@
 '''
 朴素贝叶斯分类器
 '''
-categories = ['culture', 'economy', 'energy', 'environment', 'political', 'security', 'technology']
 train_set = []
 test_set = []
 
-
 def train_native_bayes_classifier(m_features, post_list, vocab_set=None):
     global word_features
     word_features = m_features   # [(lst1,cat1),(lst2,cat2),...,(lst7,cat7)]
     random.shuffle(post_list)  # 打乱顺序
     lst_sum = len(post_list)
-    pre = round(lst_sum * 0.8)  # 前80%的数据作为训练集,后20%的数据作为测试集
-    train_set = post_list[:pre]  # [('文档所含单词集','类别'),('文档所含单词集','类别')]
+    pre = int(round(lst_sum * 0.8))  # 前80%的数据作为训练集,后20%的数据作为测试集
+    train_set = post_list[:pre]  # [('文档所含单词集','类别'),..,('文档所含单词集','类别')]
     test_set = post_list[pre:]
+
+    for i in range(150):
+        test_set[i] = (test_set[i][0], categories[random.randint(0, 6)])
+
     train_data = [(doc_features(doc, category), category) for (doc, category) in train_set]
     test_data = [(doc_features(doc, category), category) for (doc, category) in test_set]
     classifier = nltk.classify.NaiveBayesClassifier.train(train_data)
+
+    print len(train_data), '--', len(test_data)
+    print len(train_set), '--', len(test_set)
+    for it in test_data:
+        res = classifier.classify(it[0])
+        print it[1], '< --------------- >', it[1] == res
+    print 'test_accuracy is %.7f' % nltk.classify.accuracy(classifier, test_data)
     f = open(os.path.join(mac_f_path, 'my_classifier_pickle'), 'wb')
     pickle.dump(classifier, f)
     f.close()
-    print 'test_accuracy is %.4f' % nltk.classify.accuracy(classifier, test_data)
 
 '''
 获取保存的模型
@@ -49,30 +57,42 @@ def get_model():
 '''
 构建一个字典，主要表示输入文档的单词，是否出现在自己构建的语料库中
 '''
-
-
 def doc_features(doc, category):
     doc_words = set(doc)
     d_features = {}
+    for wf in word_features:
+        if category == wf[1]:
+            cat_words = set(wf[0])
+            for word in cat_words:
+                d_features['contains(%s)' % word] = (word in doc_words)
+    """
     if category == categories[0]:
+        print category, '----', category == word_features[0][1]
         for word in word_features[0][0]:
             d_features['contains(%s)' % word] = (word in doc_words)
     elif category == categories[1]:
+        print category, '----', category == word_features[1][1]
         for word in word_features[1][0]:
             d_features['contains(%s)' % word] = (word in doc_words)
     elif category == categories[2]:
+        print category, '----', category == word_features[2][1]
         for word in word_features[2][0]:
             d_features['contains(%s)' % word] = (word in doc_words)
     elif category == categories[3]:
+        print category, '----', category == word_features[3][1]
         for word in word_features[3][0]:
             d_features['contains(%s)' % word] = (word in doc_words)
     elif category == categories[4]:
+        print category, '----', category == word_features[4][1]
         for word in word_features[4][0]:
             d_features['contains(%s)' % word] = (word in doc_words)
     elif category == categories[5]:
+        print category, '----', category == word_features[5][1]
         for word in word_features[5][0]:
             d_features['contains(%s)' % word] = (word in doc_words)
     elif category == categories[6]:
+        print category, '----', category == word_features[6][1]
         for word in word_features[6][0]:
             d_features['contains(%s)' % word] = (word in doc_words)
+    """
     return d_features
diff --git a/src_02/text_processing.py b/src_02/text_processing.py
@@ -13,7 +13,7 @@
 import os
 from collections import Counter
 from threading import Thread
-from file_path_constant import *
+from config import *
 from nltk import pos_tag, pos_tag_sents
 from nltk.corpus import stopwords as stopwords
 from nltk.stem import WordNetLemmatizer
@@ -47,13 +47,12 @@ def text_parse(input_text, language='eng'):
                   | (?:[,.;'"?():-_`])"""
 
     tag_list = set(['TO', 'RB', 'RBR', 'RBRS', 'UH', 'WDT', 'WP', 'WP$', 'WRB', 'SYM', 'RP', 'PRP', 'PRP$', 'CD', 'POS',':'])
-    word_list =regexp_tokenize(sentence, pattern)
+    word_list = regexp_tokenize(sentence, pattern)
     if language is 'eng':
         filter_word = [w for w in word_list if w not in stopwords.words('english') and w not in special_tag]  # 去停用词和特殊标点符号
     else:
         filter_word = [w for w in word_list if w not in stopwords.words('french') and w not in special_tag]  # 去停用词和特殊标点符号
-    word_tag = pos_tag(filter_word, tagset=None, lang=language)  # 词性标注，返回标记列表[('Codeine', 'NNP'), ('15mg', 'CD')]
-    print word_tag
+    word_tag = pos_tag(filter_word, tagset=None, lang=language)  # 词性标注，返回标记列表[('Codeine', 'NNP'), ('15mg', 'CD')
     res_word_list = []
     for i in range(0, len(word_tag)):  # 去掉副词、介词、小品词、疑问词、代词、人称代词、所有格代名词等
         if word_tag[i][1] in tag_list:
@@ -213,7 +212,7 @@ def get_class_features():
     t_time_end = time.time()
     print u'进程耗时%.4f 秒' % (t_time_end - t_time) # 453秒
     print u'文本去除停用词、词形还原后还剩余', len(list(total_vocab_set)), u'个不重复单词。'
-    docs_features = get_doc_features(post_list, total_vocab_set, 0.01)  # [[],[],..,[]]
+    docs_features = get_doc_features(post_list, total_vocab_set, threshold)  # [[],[],..,[]]
     for i in range(0, len(m_categories)):
         if m_categories[i] == 'culture':
             cul.extend(docs_features[i])
@@ -229,13 +228,13 @@ def get_class_features():
             sec.extend(docs_features[i])
         elif m_categories[i] == 'technology':
             tec.extend(docs_features[i])
-    features.append((env, 'environment'))
-    features.append((eco, 'economy'))
-    features.append((pol, 'political'))
-    features.append((cul, 'culture'))
-    features.append((sec, 'security'))
-    features.append((tec, 'technology'))
-    features.append((ene, 'energy'))
+    features.append((list(set(env)), 'environment'))
+    features.append((list(set(eco)), 'economy'))
+    features.append((list(set(pol)), 'political'))
+    features.append((list(set(cul)), 'culture'))
+    features.append((list(set(sec)), 'security'))
+    features.append((list(set(tec)), 'technology'))
+    features.append((list(set(ene)), 'energy'))
 
     end_time = time.time()
     print 'method get_class_features() cost total time %0.4f seconds' % (end_time - start_time)  # 530秒

diff --git a/test/culture/culture.xlsx b/test/culture.xlsx
diff --git a/test/economy/economy.xlsx b/test/economy.xlsx
diff --git a/test/energy/energy.xlsx b/test/energy.xlsx
diff --git a/test/environment/environment.xlsx b/test/environment.xlsx
diff --git a/test/political/political.xlsx b/test/political.xlsx
diff --git a/test/security/security.xlsx b/test/security.xlsx
diff --git a/test/technology/tec2.xlsx b/test/tec2.xlsx
diff --git a/test/technology.xlsx b/test/technology.xlsx
diff --git a/test/technology/technology.xlsx b/test/technology/technology.xlsx