-
Notifications
You must be signed in to change notification settings - Fork 4
/
kirra_libs.py
33 lines (29 loc) · 924 Bytes
/
kirra_libs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os
from spacy.lang.id import Indonesian
import fnmatch
def getAllFileinFolder(folderpath):
    """Collect every ``*.txt`` file found under ``folderpath``, recursively.

    Args:
        folderpath: Root directory handed to ``os.walk``.

    Returns:
        A list of path strings, each built as the containing directory,
        a forward slash, and the file name. Entries appear in the order
        in which ``os.walk`` visits them.
    """
    return [
        parent + "/" + name
        for parent, _subdirs, names in os.walk(folderpath)
        for name in fnmatch.filter(names, '*.txt')
    ]
def writedataa(list, thname):
    """Write the unique entries of ``list``, sorted, one per line, to a file.

    The output file is named ``sentence_rep_<thname>.txt`` and is opened
    relative to the current working directory in write mode, so an existing
    file with that name is overwritten.

    Args:
        list: Iterable of strings to write. Duplicates are dropped and the
            remainder is written in sorted order. The parameter name shadows
            the builtin; it is kept so existing keyword callers keep working.
        thname: Value substituted into the output file name.

    Raises:
        TypeError: If the entries cannot be hashed, sorted, or concatenated
            with a newline (for example, a mix of strings and numbers).
    """
    # Build the full output before opening the file, so invalid input cannot
    # leave behind an empty or truncated file.
    lines = [entry + "\n" for entry in sorted(set(list))]
    # The context manager closes the handle even if the write fails.
    with open("sentence_rep_{}.txt".format(thname), "w") as out:
        out.writelines(lines)
# Module-level spaCy Indonesian language object, created once at import time
# and shared by the tokenize helpers below.
nlp = Indonesian()
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the lower-cased lemma of each token.

    Args:
        text: Input to process; it is formatted into a string before being
            passed to the module-level ``nlp`` object.

    Returns:
        A list with one lower-cased ``lemma_`` string per token, in
        document order.
    """
    # NOTE(review): the lemma values depend on how the module-level ``nlp``
    # pipeline is configured - confirm it actually produces lemmas.
    parsed = nlp(u"{}".format(text))
    return [token.lemma_.lower() for token in parsed]
def tokenize_only(text):
    """Tokenize ``text`` and return each token's surface form, lower-cased.

    Args:
        text: Input to process; it is formatted into a string before being
            passed to the module-level ``nlp`` object.

    Returns:
        A list with one lower-cased token text per token, in document order.
    """
    as_text = u"{}".format(text)
    words = []
    for token in nlp(as_text):
        words.append(token.text.lower())
    return words