Skip to content

Commit

Permalink
Close the unimplemented module.
Browse files Browse the repository at this point in the history
  • Loading branch information
zake7749 committed Nov 23, 2016
1 parent d95429a commit 863aefa
Show file tree
Hide file tree
Showing 40 changed files with 1,258,299 additions and 54 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ __pycache__
*.model
*.log
.DS_Store
*log.txt
*log.txt
Taiba/
11 changes: 11 additions & 0 deletions Chatbot/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
*.bin
__pycache__
*.pyc
*.train
*.model
*.log
.DS_Store
*log.txt
Taiba
QuestionAnswering/data/processed/reply
jieba_dictionary/dict.txt.big
15 changes: 15 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/KeywordMatcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from .matcher import Matcher

class KeywordMatcher(Matcher):

    """
    Phrase-similarity matcher based on TF-IDF keyword weights.

    Placeholder: the scoring logic has not been implemented yet; calling
    :meth:`match` raises ``NotImplementedError``.
    """

    def __init__(self):
        # Initialize the shared Matcher state (titles, stopwords, ...),
        # which the original version skipped.
        super().__init__()
        self.vecModel = None  # TF-IDF model, to be built/loaded when implemented

    def match(self, query):
        """Return the corpus title most similar to ``query``.

        Raises:
            NotImplementedError: always, until this matcher is implemented.
            (The original ``match`` had a comment-only body, which is a
            syntax error and prevented the module from importing at all.)
        """
        raise NotImplementedError("KeywordMatcher.match is not implemented yet")
Empty file.
1 change: 1 addition & 0 deletions Chatbot/QuestionAnswering/Matcher/deepLearning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#TODO
83 changes: 83 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/fuzzyMatcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from .matcher import Matcher
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

class FuzzyMatcher(Matcher):

    """
    Title matcher based on Levenshtein edit distance (via fuzzywuzzy).
    """

    def __init__(self, segLib="Taiba", removeStopWords=False):
        super().__init__(segLib)
        self.cleanStopWords = removeStopWords
        if removeStopWords:
            for sw_path in ("data/stopwords/chinese_sw.txt",
                            "data/stopwords/specialMarks.txt"):
                self.loadStopWords(sw_path)

    def joinTitles(self):
        # Collapse each segmented title (a token list) back into one string.
        self.segTitles = ["".join(tokens) for tokens in self.segTitles]

    def tieBreak(self, query, i, j):
        """Pick the better of two equally-scored titles by re-scoring the
        raw (un-cleaned) titles against the raw query.

        Args:
            query: the user's original input
            i: index of the first candidate title
            j: index of the second candidate title
        Returns:
            (target, index): the preferred raw title and its index.
        """
        first, second = self.titles[i], self.titles[j]
        if fuzz.ratio(query, first) > fuzz.ratio(query, second):
            return (first, i)
        return (second, j)

    def match(self, query, custom_title=None):
        """Find the corpus sentence most similar to the user's query.

        Args:
            query: the sentence to look up
            custom_title: optional custom candidate set (ignored when
                stopword cleaning is enabled; segTitles is used instead)
        Returns:
            (target, index): the best-matching title and its index.
        """
        best_ratio = -1
        best_title = ""
        best_idx = -1

        if self.cleanStopWords:
            kept = [w for w in self.wordSegmentation(query)
                    if w not in self.stopwords]
            cleaned_query = "".join(kept)
            candidates = self.segTitles
        else:
            candidates = self.titles if custom_title is None else custom_title
            cleaned_query = query

        for idx, candidate in enumerate(candidates):
            score = fuzz.ratio(cleaned_query, candidate)
            if score > best_ratio:
                best_ratio, best_title, best_idx = score, candidate, idx
            elif self.cleanStopWords and score == best_ratio:
                # Equal after cleaning: fall back to the raw titles.
                best_title, best_idx = self.tieBreak(query, best_idx, idx)

        self.similarity = best_ratio
        return best_title, best_idx
121 changes: 121 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/matcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import logging
import os

import jieba
import Taiba

class Matcher(object):

    """
    Compare a user query against a corpus of titles and return the most
    similar one.  Subclasses override :meth:`match`; this base class
    provides exact matching plus shared utilities (word segmentation,
    stopword handling, title loading).
    """

    def __init__(self, segLib="Taiba"):
        """
        Args:
            segLib: segmentation backend; "Taiba" selects Taiba, any other
                value selects jieba.
        """
        logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
        self.titles = []     # every candidate title to match against
        self.segTitles = []  # the titles after word segmentation

        self.stopwords = set()
        self.similarity = 1.  # similarity score of the most recent match

        self.useTaiba = (segLib == "Taiba")

    def jiebaCustomSetting(self, dict_path, usr_dict_path):
        """Point jieba at a main dictionary and register user-defined words."""
        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, 'r', encoding='utf-8') as dic:
            for word in dic:
                jieba.add_word(word.strip('\n'))

    def TaibaCustomSetting(self, usr_dict):
        """Register user-defined words with Taiba (one word per line)."""
        with open(usr_dict, 'r', encoding='utf-8') as dic:
            for word in dic:
                Taiba.add_word(word.strip('\n'))

    def loadStopWords(self, path):
        """Add one stopword per line of ``path`` to ``self.stopwords``."""
        with open(path, 'r', encoding='utf-8') as sw:
            for word in sw:
                self.stopwords.add(word.strip('\n'))

    def loadTitles(self, path):
        """Load the candidate titles, one per line of ``path``."""
        with open(path, 'r', encoding='utf-8') as data:
            self.titles = [line.strip('\n') for line in data]

    def match(self, query):
        """Exact-match lookup of ``query`` in the corpus.

        Args:
            query: the user's input
        Returns:
            (title, index) of the identical title, or ``None`` when no
            exact match exists.
        """
        for index, title in enumerate(self.titles):
            if title == query:
                return title, index
        # Explicit miss (the original assigned an unused `result` variable
        # and fell off the end, returning None implicitly).
        return None

    def getSimilarity(self):
        """Similarity score recorded by the most recent match."""
        return self.similarity

    def wordSegmentation(self, string):
        """Segment ``string`` into words; always returns a list."""
        if self.useTaiba:
            return Taiba.lcut(string, CRF=True)
        # jieba.cut yields a one-shot generator; materialize it so callers
        # (e.g. TitlesSegmentation, which stores the result) can iterate
        # it more than once.
        return list(jieba.cut(string, cut_all=True))

    def TitlesSegmentation(self, cleanStopwords=False):

        """
        Segment ``self.titles`` and store the result in ``self.segTitles``,
        caching it to data/SegTitles.txt.

        Args:
            cleanStopwords: whether to drop stopwords from each title.
        NOTE(review): the cache file does not record whether it was built
        with stopword cleaning; a stale cache is reused as-is.
        """

        logging.info("正準備將 titles 斷詞")

        count = 0

        if not os.path.exists('data/SegTitles.txt'):

            self.segTitles = []
            for title in self.titles:

                if cleanStopwords:
                    clean = [word for word in self.wordSegmentation(title)
                             if word not in self.stopwords]
                    self.segTitles.append(clean)
                else:
                    self.segTitles.append(self.wordSegmentation(title))

                count += 1
                if count % 1000 == 0:
                    logging.info("已斷詞完前 %d 篇文章" % count)

            with open('data/SegTitles.txt', 'w', encoding="utf-8") as seg_title:
                for title in self.segTitles:
                    seg_title.write(' '.join(title) + '\n')
            logging.info("完成標題斷詞,結果已暫存至 data/SegTitles.txt")
        else:
            logging.info("偵測到先前的標題斷詞結果,讀取中...")
            with open('data/SegTitles.txt', 'r', encoding="utf-8") as seg_title:
                for line in seg_title:
                    seg = line.strip('\n').split()

                    if cleanStopwords:
                        seg = [word for word in seg
                               if word not in self.stopwords]
                    self.segTitles.append(seg)
            logging.info("%d 個標題已完成載入" % len(self.segTitles))
14 changes: 14 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/vectorMatcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from gensim import models,corpora
from sklearn import svm

# Fixed import: `from . import Matcher` tried to load a submodule named
# "Matcher" from this package (which does not exist — the class lives in
# the sibling module `matcher`, as every other matcher imports it).
from .matcher import Matcher

class VectorMatcher(Matcher):

    """
    Vector-space phrase matcher.  Placeholder: not implemented yet.
    """

    def __init__(self):
        # Initialize the shared Matcher state, which the original skipped.
        super().__init__()
        self.vecModel = None  # embedding/vector model, to be built later

    def match(self, query):
        """Return the corpus title most similar to ``query``.

        Raises:
            NotImplementedError: always, until this matcher is implemented.
            (The original ``match`` had a comment-only body, which is a
            syntax error.)
        """
        raise NotImplementedError("VectorMatcher.match is not implemented yet")
6 changes: 6 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/wordBagMatcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from .matcher import Matcher

# This matcher has been superseded by the improved WordWeightMatcher.

class WordBagMatcher(Matcher):
    """Deprecated placeholder: superseded by WordWeightMatcher
    (see wordWeightMatcher.py); kept only for backward compatibility."""
    pass
108 changes: 108 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/wordWeightMatcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import math
import logging

import gensim

from collections import defaultdict

from .matcher import Matcher

class WordWeightMatcher(Matcher):

    """
    Phrase-similarity matcher based on corpus word weights: a title scores
    higher the more (and the rarer) the words it shares with the query.
    """

    def __init__(self, segLib="Taiba"):

        super().__init__(segLib)

        self.wordDictionary = defaultdict(int)  # word -> occurrence count
        self.totalWords = 0                     # total token count of the corpus
        self.wordWeights = defaultdict(int)     # word -> weight; unseen words weigh 0

    def initialize(self):
        """Segment the titles, count words, and compute the weights."""
        logging.info("初始化模塊中...")
        # NOTE(review): stopwords are loaded AFTER TitlesSegmentation, so the
        # segmented titles are never stopword-filtered here; only the query
        # side is filtered in match(). Confirm this is intentional.
        self.TitlesSegmentation()
        self.buildWordDictionary()
        self.loadStopWords("data/stopwords/chinese_sw.txt")
        self.loadStopWords("data/stopwords/specialMarks.txt")
        self.calculateWeight()
        logging.info("初始化完成 :>")

    def buildWordDictionary(self):
        """Count every word occurrence across the segmented titles."""
        for title in self.segTitles:
            for word in title:
                self.wordDictionary[word] += 1
                self.totalWords += 1
        logging.info("詞記數完成")

    def buildWordBag(self):
        # TODO(review): self.titles holds raw strings, but corpora.Dictionary
        # expects token lists (self.segTitles); the result is also discarded.
        # Left unchanged pending a real implementation.
        dictionary = gensim.corpora.Dictionary(self.titles)

    def calculateWeight(self):
        # For the mathematical derivation see (Chinese):
        # "非主流自然语言处理——遗忘算法系列(四):改进TF-IDF权重公式"
        # http://www.52nlp.cn/forgetnlp4
        # The stored weight is the trailing term, i.e. -1 * log(N/T).
        for word, count in self.wordDictionary.items():
            self.wordWeights[word] = -1 * math.log10(count / self.totalWords)
        logging.info("詞統計完成")

    def getCooccurrence(self, q1, q2):
        """Return the words of ``q1`` that also occur in ``q2``
        (order and duplicates of q1 preserved)."""
        # Set membership makes this O(len(q1)+len(q2)) instead of the
        # original O(len(q1)*len(q2)) list scan; results are identical.
        q2_words = set(q2)
        return [word for word in q1 if word in q2_words]

    def getWordWeight(self, word, n=1):
        """Weight of ``word`` scaled by ``n``; unseen words weigh 0.

        TODO FIX N: the scaling factor is not yet tuned (kept from original).
        """
        return n * self.wordWeights[word]

    def match(self, query, sort=False):

        """
        Find the corpus title whose word-weight overlap with ``query`` is
        largest.

        Args:
            query: the user's input sentence
            sort: unused; kept for interface compatibility
        Returns:
            (target, target_idx): the best title (as a token list) and its
            index, or ("", -1) when the corpus is empty.
        """

        max_similarity = -1
        target = ""
        # Fixed: this was initialized as `index`, so an empty corpus (or one
        # with only empty titles) raised NameError at the return statement.
        target_idx = -1

        segQuery = [word for word in self.wordSegmentation(query)
                    if word not in self.stopwords]

        for index, title in enumerate(self.segTitles):

            if len(title) == 0:
                continue

            coWords = self.getCooccurrence(title, segQuery)

            coWordsWeight = 0.
            for word in coWords:
                coWordsWeight += self.getWordWeight(word)

            allWordsWeight = 0.
            for word in title:
                if word not in coWords:
                    allWordsWeight += self.getWordWeight(word)
            for word in segQuery:
                if word not in coWords:
                    allWordsWeight += self.getWordWeight(word)

            if allWordsWeight == 0:
                # Every weighted word co-occurs: avoid the original
                # ZeroDivisionError and treat it as a perfect overlap.
                similarity = float("inf") if coWordsWeight > 0 else 0.
            else:
                similarity = coWordsWeight / allWordsWeight

            if similarity > max_similarity:
                max_similarity = similarity
                target = title
                target_idx = index

        self.similarity = max_similarity * 100  # normalize to a 0-100 scale

        return target, target_idx
5 changes: 5 additions & 0 deletions Chatbot/QuestionAnswering/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
## 簡易問答

目前的 QA 是基於 [PTT-Push-Generator](https://github.com/zake7749/PTT-Push-Generator) 進行。

*注意:*目前仍未上傳 QA 的資料集,進行測試時請先關閉 QA 功能
3 changes: 3 additions & 0 deletions Chatbot/QuestionAnswering/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import os
import sys

# Put this package's own directory on sys.path so sibling modules can be
# imported without package-qualified names.
sys.path.append(os.path.split(__file__)[0])
Loading

0 comments on commit 863aefa

Please sign in to comment.