-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocessor.py
93 lines (69 loc) · 2.91 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
from nltk import ngrams
import unicodedata
# Matches one run of underscores and lower-case Latin/Vietnamese letters,
# i.e. one word token in text that has already been lower-cased.
KEEP_VN_CHAR = re.compile(
    u"[_aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+")

# Upper-case Vietnamese letters outside ASCII; the patterns below derive the
# lower-case forms with ``VN_CHAR.lower()``.
VN_CHAR = "ẮẰẲẴẶĂẤẦẨẪẬÂÁÀÃẢẠĐẾỀỂỄỆÊÉÈẺẼẸÍÌỈĨỊỐỒỔỖỘÔỚỜỞỠỢƠÓÒÕỎỌỨỪỬỮỰƯÚÙỦŨỤÝỲỶỸỴ"

# Matches an unsigned number with an optional integer part and decimal
# point, e.g. "42", "3.14" or ".5".
FILTER_FLOAT_NUM = re.compile(r'([0-9]*[.])?[0-9]+')

# Matches a run of characters that are NOT ASCII letters, '.', '_' or
# Vietnamese letters.
FILTER_SPECIAL_CHAR = re.compile('[^A-Za-z._' + VN_CHAR + VN_CHAR.lower() + ']+')

# Matches a single letter standing alone between two spaces.
# NOTE(review): matches cannot overlap, so in "x a b y" only "a" is matched
# (its trailing space is consumed) - confirm whether that is intended.
FILTER_LETTER = re.compile(' [a-zA-Z' + VN_CHAR + VN_CHAR.lower() + '] ')

# Matches an ellipsis: two or more consecutive dots. Written as a raw string
# because "\." is not a valid escape sequence in an ordinary string literal.
FILTER_EPLISIS = re.compile(r'\.{2,}')
def nomalize_uni_string(row):
    """Return ``row`` unchanged.

    Placeholder for string normalisation. The steps once sketched here
    (lower-casing, removing numbers, stripping sentence-final punctuation,
    dropping special characters, collapsing repeated spaces and trimming)
    are all disabled, so the function is currently the identity.

    Args:
        row: The value to normalise.

    Returns:
        The very same object that was passed in.
    """
    return row
def split_row_to_word(string):
    """Split ``string`` on single space characters.

    No cleaning is applied: consecutive spaces produce empty-string items
    and an empty input yields ``['']``.

    Args:
        string: The text to split.

    Returns:
        The list of pieces between single spaces.
    """
    pieces = string.split(" ")
    return pieces
def split_preprocessor_row_to_word(string):
    """Extract the lower-case word tokens of ``string``.

    The text is NFC-normalised and lower-cased, then every run of
    characters accepted by ``KEEP_VN_CHAR`` (underscore, Latin and
    Vietnamese letters) is returned as one token; everything else acts as a
    separator.

    Args:
        string: The raw text to tokenise.

    Returns:
        The list of matched tokens, in order of appearance. Empty when the
        text contains no accepted character.
    """
    # Compose base letter + combining mark into a single code point first,
    # as preprocess_row does; otherwise decomposed (NFD) input would be cut
    # at every combining diacritic.
    composed = unicodedata.normalize('NFC', string)
    return KEEP_VN_CHAR.findall(composed.lower())
def split_preprocessor_row_to_word_v2(row):
    """Clean ``row`` with ``preprocess_row`` and split it into words.

    Args:
        row: The raw text to tokenise.

    Returns:
        The list of non-empty words of the cleaned text; an empty list when
        nothing is left after cleaning.
    """
    # split() without a separator skips runs of spaces, so no empty-string
    # tokens are produced (split(" ") returned [''] for empty text and ''
    # between consecutive spaces).
    return preprocess_row(row).split()
def preprocess_row(row):
    """Clean a raw text row into lower-case, space-separated words.

    Steps, applied in order:

    1. remove every number matched by ``FILTER_FLOAT_NUM``;
    2. NFC-normalise the text and replace each run of characters rejected
       by ``FILTER_SPECIAL_CHAR`` with one space;
    3. lower-case the text and replace each match of ``FILTER_LETTER``
       (a single letter between two spaces) with one space;
    4. replace each ellipsis matched by ``FILTER_EPLISIS`` with one space;
    5. collapse runs of whitespace and trim both ends.

    Args:
        row: The raw text to clean.

    Returns:
        The cleaned text, with words separated by exactly one space. May be
        an empty string.
    """
    without_numbers = FILTER_FLOAT_NUM.sub('', row)
    composed = unicodedata.normalize('NFC', without_numbers)
    without_specials = FILTER_SPECIAL_CHAR.sub(' ', composed)
    without_single_letters = FILTER_LETTER.sub(' ', without_specials.lower())
    without_ellipsis = FILTER_EPLISIS.sub(' ', without_single_letters)
    # An ellipsis that already sits next to a space leaves several spaces in
    # a row once replaced; collapse them so callers splitting on " " do not
    # receive empty tokens. This also trims leading/trailing spaces.
    return ' '.join(without_ellipsis.split())
def split_query_to_train_word(query):
    """Split ``query`` on single space characters.

    The query is split as-is: consecutive spaces produce empty-string items
    and an empty query yields ``['']``.

    Args:
        query: The query text to split.

    Returns:
        The list of pieces between single spaces.
    """
    train_words = query.split(" ")
    return train_words
def split_tag_to_word(string):
    """Split a ``;``-separated tag string into individual words.

    Semicolons are treated like spaces, so multi-word tags are broken into
    their words as well.

    Args:
        string: The tag text, e.g. ``"tag one;tag two"``.

    Returns:
        The list of non-empty words. Empty for an empty string or one made
        only of separators.
    """
    # Drop the empty pieces that appear around "; ", a trailing ";" or
    # repeated spaces; they are not words.
    return [word for word in string.replace(";", " ").split(" ") if word]
def split_preprocessor_title_to_word(title):
    """Clean ``title`` with ``preprocess_row`` and split it into words.

    Args:
        title: The raw title text.

    Returns:
        The list of non-empty words of the cleaned title; an empty list when
        nothing is left after cleaning.
    """
    # split() without a separator skips runs of spaces, so an empty or
    # ellipsis-laden title no longer contributes empty-string tokens.
    return preprocess_row(title).split()
def get_train_word_from_title_and_tags(title, tags):
    """Build the training word list from a title and its optional tags.

    Args:
        title: The raw title text; tokenised with
            ``split_preprocessor_title_to_word``.
        tags: The tag text, tokenised with ``split_tag_to_word``. Any value
            that is not a ``str`` is ignored (presumably a missing-value
            marker - confirm against the caller).

    Returns:
        The title words, followed by the tag words when ``tags`` is a string.
    """
    words = split_preprocessor_title_to_word(title)
    if not isinstance(tags, str):
        return words
    words.extend(split_tag_to_word(tags))
    return words
def get_query_word_from_title_and_tags(title, tags):
    """Build a query string from a title and its optional tags.

    Args:
        title: The raw title text; cleaned with ``preprocess_row``.
        tags: The tag text. When it is a ``str``, its semicolons are
            replaced by spaces and it is appended to the cleaned title after
            one space. Any other value is ignored (presumably a
            missing-value marker - confirm against the caller).

    Returns:
        The cleaned title, optionally followed by a space and the tag text.
    """
    cleaned_title = preprocess_row(title)
    if not isinstance(tags, str):
        return cleaned_title
    return " ".join((cleaned_title, tags.replace(";", " ")))