From af7e85a38a03607f864016789e69260dff12b536 Mon Sep 17 00:00:00 2001 From: SeanLee97 Date: Fri, 14 Sep 2018 22:27:48 +0800 Subject: [PATCH] fixed some bug --- xmnlp/postag/postag.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/xmnlp/postag/postag.py b/xmnlp/postag/postag.py index dc90840..01afbdf 100644 --- a/xmnlp/postag/postag.py +++ b/xmnlp/postag/postag.py @@ -92,22 +92,26 @@ def seg(self, sent): continue if R.zh.match(s): for w in list(self.dag.seg(s)): - yield w + if len(w.strip()) > 0: + yield w else: tmp = R.skip.split(s) for x in tmp: if R.skip.match(x): - yield x + if len(x.strip()) > 0: + yield x else: x = x.replace(' ','') endigts = R.endigt.findall(x) parts = re.split(r'[0-9]+\.?[0-9]+|[0-9]+|[a-zA-Z]+', x) if len(endigts) > 0: for w, t in self.re_decode(parts, endigts, False): - yield w + if len(w.strip()) > 0: + yield w else: for xx in x: - yield xx + if len(xx.strip()) > 0: + yield xx def tag(self, sent): for s in R.zh.split(sent): s = s.strip() @@ -117,19 +121,23 @@ def tag(self, sent): continue if R.zh.match(s): for w,t in self.dag.tag(s): - yield w, t + if len(w.strip()) > 0: + yield w, t else: tmp = R.skip.split(s) for x in tmp: if R.skip.match(x): - yield x + if len(x.strip()) > 0: + yield x else: x = x.replace(' ', '') endigts = R.endigt.findall(x) parts = re.split(r'[0-9]+\.?[0-9]+|[0-9]+|[a-zA-Z]+', x) if len(endigts) > 0: for w, t in self.re_decode(parts, endigts, True): - yield w, t + if len(w.strip()) > 0: + yield w, t else: for xx in x: - yield xx, 'un' + if len(xx.strip()) > 0: + yield xx, 'un'