-
Notifications
You must be signed in to change notification settings - Fork 0
/
Model_Training.py
104 lines (79 loc) · 3.86 KB
/
Model_Training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from __future__ import unicode_literals, print_function
import spacy
from spacy.training import Example
from PySide6 import QtCore
from PySide6.QtWidgets import QApplication
import os
import random
import time
class Model_Training(QtCore.QThread):
    """Qt thread object that fine-tunes the NER component of a spaCy pipeline.

    Training progress is reported through the ``updateProgress`` signal so a
    GUI can reflect it (e.g. in a progress bar).
    """

    # Signal emitted once per training iteration. Declaring it with ``int``
    # tells Qt to expect a single integer argument on every emission.
    updateProgress = QtCore.Signal(int)

    def train_model(self, df_train_input, n_iter=100):
        """Train the NER component on the not-yet-trained rows of a table.

        Rows whose ``'Trained'`` value differs from ``'X'`` are used as
        training data: the full ``'Entity name'`` text is annotated as one
        entity span carrying the row's ``'Entity label'``.

        The base pipeline is loaded from
        ``<cwd>/en_core_web_lg/en_core_web_lg-3.1.0`` and the updated pipeline
        is written to ``<cwd>/ner``. Both paths are resolved against the
        current working directory (``os.path.abspath('')``).

        Args:
            df_train_input: Table with the columns ``'Trained'``,
                ``'Entity name'`` and ``'Entity label'``. Presumably a pandas
                DataFrame (it is boolean-indexed and column-assigned) --
                confirm against the caller. It is modified in place.
            n_iter: Number of passes over the training data. ``updateProgress``
                emits the 1-based iteration number, so it runs from 1 to
                ``n_iter``; with the default it runs from 1 to 100.

        Returns:
            The same ``df_train_input`` object. When training ran, its
            ``'Trained'`` column is set to ``'X'`` for every row.
        """
        base_dir = os.path.abspath('')
        # os.path.join instead of hard-coded '\\' keeps the paths valid on
        # non-Windows systems; on Windows the result is unchanged.
        model_dir = os.path.join(base_dir, 'en_core_web_lg', 'en_core_web_lg-3.1.0')
        output_dir = os.path.join(base_dir, 'ner')

        # Select the rows that still need training before loading the model,
        # so nothing expensive happens when there is no work to do.
        pending = df_train_input[df_train_input['Trained'] != 'X']
        if len(pending) == 0:
            return df_train_input

        # Prepare the training data: each entity name is its own text, and
        # the whole text (offsets 0..len) is the labelled entity span.
        training_data = [
            (entity_name, {'entities': [(0, len(entity_name), entity_label)]})
            for entity_name, entity_label in zip(
                pending['Entity name'], pending['Entity label']
            )
        ]

        nlp = spacy.load(model_dir)

        # Set up the pipeline. spaCy v3 requires the factory *name* here;
        # passing a component object to add_pipe() raises an error.
        ner_is_new = 'ner' not in nlp.pipe_names
        if ner_is_new:
            ner = nlp.add_pipe('ner', last=True)
        else:
            ner = nlp.get_pipe('ner')

        # Register every label that occurs in the training data.
        for _, annotations in training_data:
            for _start, _end, label in annotations['entities']:
                ner.add_label(label)

        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
        with nlp.select_pipes(disable=other_pipes):  # only train NER
            if ner_is_new:
                # A component that was just added has no weights yet and
                # must be initialized before it can be updated.
                ner.initialize(
                    lambda: [
                        Example.from_dict(nlp.make_doc(text), annotations)
                        for text, annotations in training_data
                    ],
                    nlp=nlp,
                )
            optimizer = nlp.create_optimizer()
            for itn in range(n_iter):
                # Let the GUI handle pending events, then report progress.
                # NOTE(review): processEvents() is only appropriate when this
                # method runs on the GUI thread -- confirm how it is invoked.
                QApplication.processEvents()
                self.updateProgress.emit(itn + 1)
                # NOTE(review): presumably gives the GUI time to repaint;
                # it adds 0.2 s per iteration to the total training time.
                time.sleep(0.2)
                random.shuffle(training_data)
                losses = {}
                # NOTE(review): examples are grouped in batches of 2 but the
                # model is updated one example at a time, so the effective
                # batch size is 1. Kept as-is to preserve training behaviour.
                for batch in spacy.util.minibatch(training_data, size=2):
                    for text, annotations in batch:
                        doc = nlp.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        nlp.update(
                            [example],
                            drop=0.5,
                            sgd=optimizer,
                            losses=losses,
                        )

        # Save the model once every component is re-enabled, so the pipeline
        # on disk does not record the other components as disabled.
        os.makedirs(output_dir, exist_ok=True)
        nlp.to_disk(output_dir)

        # Mark every row as trained so it is skipped on the next call.
        df_train_input['Trained'] = 'X'
        return df_train_input