Finish general qa on some domain.
zake7749 committed Nov 3, 2016
1 parent b01872a commit 9e76fb4
Showing 6 changed files with 70 additions and 38 deletions.
3 changes: 3 additions & 0 deletions Chatbot/QuestionAnswering/__init__.py
@@ -0,0 +1,3 @@
import os
import sys
sys.path.append(os.path.dirname(__file__))
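
The new __init__.py puts the package directory itself on sys.path, so the flat, bare-name imports used between modules in this package keep resolving when the package is imported from the repository root. A minimal illustration of the mechanism, with a hypothetical directory name (not from this repo):

import os
import sys

# Hypothetical example: once a directory is appended to sys.path, any module
# file that lives directly inside it becomes importable by its bare name.
package_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "QuestionAnswering")
if package_dir not in sys.path:
    sys.path.append(package_dir)

# e.g. "import match" or "from responsesEvaluate import Evaluator" would now
# resolve from anywhere in the process, which is what the package relies on.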
10 changes: 7 additions & 3 deletions Chatbot/QuestionAnswering/chat.py
@@ -10,7 +10,7 @@
def main():

    chatter = GossipBot()
    #chatter.randomTalks()
    #chatter.randomTalks(num=1000)
    chatter.chatTime()


@@ -57,15 +57,19 @@ def getResponse(self,query,threshold=50):
        return reply

    def randomPick(self, answers):
        return answers[random.randrange(0,len(answers))][0]
        try:
            answer = answers[random.randrange(0,len(answers))][0]
        except:
            answer = "沒有資料"
        return answer

    def randomTalks(self, num=100):
        with open("data/Titles.txt",'r',encoding='utf-8') as data:
            titles = [line.strip('\n') for line in data]
        for i in range(0,num):
            query = titles[random.randrange(0,len(titles))]
            print("User: " + query)
            print("MianBot: " +self.getResponse(query))
            print("MianBot: " +self.getResponse(query) + "\n")

if __name__=="__main__":
    main()
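
randomPick now tolerates an empty candidate list by falling back to a canned reply instead of raising. A slightly tighter sketch of the same idea, assuming answers is a list of (text, score) tuples as elsewhere in this commit; the function and parameter names here are illustrative:

import random

def random_pick(answers, fallback="沒有資料"):
    # answers is assumed to be a list of (text, score) tuples; an empty list
    # falls back to the canned reply instead of raising IndexError/ValueError.
    if not answers:
        return fallback
    return random.choice(answers)[0]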
22 changes: 10 additions & 12 deletions Chatbot/QuestionAnswering/match.py
@@ -2,13 +2,10 @@
import os
import random

from responsesEvaluate import Evaluator
from Matcher.fuzzyMatcher import FuzzyMatcher
from Matcher.wordWeightMatcher import WordWeightMatcher
from Matcher.matcher import Matcher

def main():
matcherTesting("Fuzzy",removeStopWords=False)
from .responsesEvaluate import Evaluator
from .Matcher.fuzzyMatcher import FuzzyMatcher
from .Matcher.wordWeightMatcher import WordWeightMatcher
from .Matcher.matcher import Matcher

def getMatcher(matcherType,removeStopWords=False):

@@ -38,13 +35,15 @@ def getMatcher(matcherType,removeStopWords=False):
def matcherTesting(matcherType,removeStopWords=False):

    matcher = getMatcher(matcherType,removeStopWords)
    cur_path = os.path.dirname(__file__)

    while True:
        query = input("隨便說些什麼吧: ")
        title,index = matcher.match(query)
        sim = matcher.getSimilarity()
        print("最為相似的標題是 %s ,相似度為 %d " % (title,sim))

        res = json.load(open(os.path.join("data/processed/reply/",str(int(index/1000))+'.json'),'r',encoding='utf-8'))
        res = json.load(open(os.path.join(cur_path,"data/processed/reply/",str(int(index/1000))+'.json'),'r',encoding='utf-8'))
        targetId = index % 1000
        #randomId = random.randrange(0,len(res[targetId]))

@@ -63,8 +62,11 @@ def woreWeightMatch():

def fuzzyMatch(cleansw=False):

    cur_dir = os.getcwd()
    os.chdir(os.path.dirname(__file__))
    fuzzyMatcher = FuzzyMatcher(segLib="Taiba",removeStopWords=cleansw)
    fuzzyMatcher.loadTitles(path="data/Titles.txt")
    os.chdir(cur_dir)

    if cleansw:
        fuzzyMatcher.TitlesSegmentation(cleansw)
@@ -79,7 +81,3 @@
    #fuzzyMatcher.loadStopWords(path="data/stopwords/chinese_sw.txt")
    #fuzzyMatcher.loadStopWords(path="data/stopwords/ptt_words.txt")
    #fuzzyMatcher.loadStopWords(path="data/stopwords/specialMarks.txt")


if __name__ == '__main__':
    main()
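
matcherTesting and fuzzyMatch solve the same problem in two ways: the data files live next to the module rather than under the caller's working directory, so one builds paths from __file__ while the other temporarily chdirs into the module directory. A small sketch of the first approach as a reusable helper (the helper name is mine, not from the repo):

import os

def data_path(*parts):
    # Resolve a file under this module's data/ directory, independent of os.getcwd().
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", *parts)

# e.g. data_path("Titles.txt")
# e.g. data_path("processed", "reply", str(index // 1000) + ".json")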
47 changes: 36 additions & 11 deletions Chatbot/QuestionAnswering/qaBase.py
@@ -1,36 +1,61 @@
import json
import logging
import os

from .match import *
from .responsesEvaluate import Evaluator

class QABase(object):
class Answerer(object):

    def __init__(self, data_path):
    def __init__(self):

        """
        Args:
            data_path: where the data directory is located
        """
        self.general_questions = []
        self.data_path = data_path
        self.path = os.path.dirname(__file__)

        self.matcher = getMatcher(matcherType="Fuzzy")
        self.evaluator = Evaluator()
        self.testSegment()

    def testSegment(self):
        logging.info("測試斷詞模塊中")
        try:
            self.matcher.wordSegmentation("測試一下斷詞")
            logging.info("測試成功")
        except Exception as e:
            logging.info(repr(e))
            logging.info("模塊載入失敗,請確認data與字典齊全")

    def getResponse(self, sentence, api_key=None):

        if api_key is not None:
            response = self.getCustomQA(sentence,api_key)
        else:
            response = self._getGeneralQA(sentence)
            response = self.getGeneralQA(sentence)
        return response

    def getGeneralQA(self, sentence):
    def getGeneralQA(self,query,threshold=50):

        pass
        title,index = self.matcher.match(query)
        sim = self.matcher.getSimilarity()
        if sim < threshold:
            return None
        else:
            res = json.load(open(os.path.join(self.path+"/data/processed/reply/",str(int(index/1000))+'.json'),
                                 'r',encoding='utf-8'))
            targetId = index % 1000
            candidates = self.evaluator.getBestResponse(res[targetId],topk=3)
            reply = self.randomPick(candidates)
            return reply

    def randomPick(self, answers):
        try:
            answer = answers[random.randrange(0,len(answers))][0]
        except:
            answer = None
        return answer

    def getCustomQA(self, sentence, api_key):

        #TODO GET USER'S QA BY api_key
        #customqa_list = json.loads(getUserQA(api_key))
        pass
        return None
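
getGeneralQA reads replies from sharded JSON files: titles are stored 1000 per file, so index // 1000 names the shard and index % 1000 the slot inside it. A compact sketch of that lookup, assuming the same data layout (the function name is illustrative):

import json
import os

def load_reply_candidates(base_dir, index, shard_size=1000):
    # Replies are sharded 1000 titles per JSON file: shard id = index // shard_size,
    # position inside the shard = index % shard_size.
    shard_file = os.path.join(base_dir, "data", "processed", "reply",
                              str(index // shard_size) + ".json")
    with open(shard_file, "r", encoding="utf-8") as f:
        shard = json.load(f)
    return shard[index % shard_size]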
11 changes: 6 additions & 5 deletions Chatbot/QuestionAnswering/responsesEvaluate.py
@@ -7,7 +7,7 @@
from gensim import corpora

# Import the word segmentation and stopword configuration
from Matcher.matcher import Matcher
from .Matcher.matcher import Matcher

class Evaluator(Matcher):
"""
@@ -21,17 +21,18 @@ def __init__(self,segLib="Taiba"):
        self.segResponses = []
        self.totalWords = 0

        self.debugLog = open("data/EvaluateLog.txt",'w',encoding="utf-8")
        self.path = os.path.dirname(__file__)
        self.debugLog = open(self.path + "/data/EvaluateLog.txt",'w',encoding="utf-8")

        self.filteredWords = set() # responses that must be filtered out

        self.counterDictionary = defaultdict(int) # used to count word frequencies
        self.tokenDictionary = None # used to assign word ids and build the bag of words

        # load the Chinese stopwords and special marks
        self.loadStopWords(path="data/stopwords/chinese_sw.txt")
        self.loadStopWords(path="data/stopwords/specialMarks.txt")
        self.loadFilterdWord(path="data/stopwords/ptt_words.txt")
        self.loadStopWords(path=self.path + "/data/stopwords/chinese_sw.txt")
        self.loadStopWords(path=self.path + "/data/stopwords/specialMarks.txt")
        self.loadFilterdWord(path=self.path + "/data/stopwords/ptt_words.txt")

    def cleanFormerResult(self):
        """
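
The Evaluator change follows the same pattern as qaBase.py: cache os.path.dirname(__file__) once and anchor every resource path on it. The new paths are built by string concatenation, which assumes a "/" separator; a one-line equivalent with os.path.join, shown only as a portability note:

import os

# portable equivalent of self.path + "/data/EvaluateLog.txt"
debug_log_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "EvaluateLog.txt")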
15 changes: 8 additions & 7 deletions Chatbot/chatbot.py
@@ -5,6 +5,7 @@
import console
import task_modules.module_switch as module_switch
import RuleMatcher.customRuleBase as crb
import QuestionAnswering.qaBase as qa

class Chatbot(object):

@@ -26,11 +27,14 @@ def __init__(self, name="NCKU"):
        self.exception_log = open('log/exception.log','w',encoding='utf-8')
        os.chdir(cur_dir)

        # For rule matching
        self.console = console.Console(model_path="model/ch-corpus-3sg.bin")

        self.custom_rulebase = crb.CustomRuleBase() # for one time matching.
        self.custom_rulebase.model = self.console.rb.model # pass word2vec model

        # For QA
        self.answerer = qa.Answerer()

        self.default_response = [
            "是嗎?",
            "我不太明白你的意思",
@@ -107,7 +111,7 @@ def listen(self, sentence, target=None, api_key=None):
        # We can only send back a default response.
        return self.getDefaultResponse(),None,None,None

        #TODO
        #TODO
        # Use generative model to solve this case

    def getResponseOnRootDomains(self, target=None):
@@ -177,8 +181,7 @@ def getResponseForGeneralQA(self, sentence):
        Listen to the user's input and return a response based on our
        knowledge base.
        """
        #TODO hook up the QA bot
        pass
        return self.answerer.getResponse(sentence)

    def getResponseForCustomQA(self,sentence,api_key):

@@ -188,9 +191,7 @@ def getResponseForCustomQA(self,sentence,api_key):
"""
if api_key is None:
return None

#TODO 接上 QA bot
return None
return self.answerer.getResponse(sentence,api_key)

    def getLoggerData(self):
        return [self.root_domain,
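
With the new answerer member, both getResponseForGeneralQA and getResponseForCustomQA reduce to a delegation to QuestionAnswering.qaBase.Answerer. A hedged usage sketch of that flow, assuming it is run from the Chatbot/ directory with the repository's data and dictionaries in place; the query string is arbitrary:

from QuestionAnswering import qaBase

answerer = qaBase.Answerer()                 # loads the fuzzy matcher and evaluator, runs a segmentation self-test
reply = answerer.getResponse("今天天氣如何")  # general QA: fuzzy-match a title, rank and pick one of its replies
if reply is None:                            # similarity below threshold -> no answer
    reply = "我不太明白你的意思"               # caller-side fallback, e.g. one of Chatbot's default responses
print(reply)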
