[scripts] Apply encoding fix of kaldi-asr#2676 to make_lexicon_fst_silprobs.py (kaldi-asr#2680)
Skaiste · Sep 4, 2018 · 1b9f792 · 1b9f792
1 parent 76cb53a
commit 1b9f792
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 2 deletions.
diff --git a/egs/wsj/s5/utils/lang/make_lexicon_fst.py b/egs/wsj/s5/utils/lang/make_lexicon_fst.py
@@ -70,8 +70,8 @@ def read_lexiconp(filename):
     found_large_pronprobs = False
     # See the comment near the top of this file, RE why we use latin-1.
     with open(filename, 'r', encoding='latin-1') as f:
+        whitespace = re.compile("[ \t]+")
         for line in f:
-            whitespace = re.compile("[ \t]+")
             a = whitespace.split(line.strip())
             if len(a) < 2:
                 print("{0}: error: found bad line '{1}' in lexicon file {2} ".format(

diff --git a/egs/wsj/s5/utils/lang/make_lexicon_fst_silprob.py b/egs/wsj/s5/utils/lang/make_lexicon_fst_silprob.py
@@ -8,6 +8,7 @@
 import os
 import sys
 import math
+import re
 
 # The use of latin-1 encoding does not preclude reading utf-8.  latin-1
 # encoding means "treat words as sequences of bytes", and it is compatible
@@ -79,8 +80,9 @@ def read_silprobs(filename):
     nonsilendcorrection = -1
     siloverallprob = -1
     with open(filename, 'r', encoding='latin-1') as f:
+        whitespace = re.compile("[ \t]+")
         for line in f:
-            a = line.split()
+            a = whitespace.split(line.strip())
             if len(a) != 2:
                 print("{0}: error: found bad line '{1}' in silprobs file {1} ".format(
                     sys.argv[0], line.strip(), filename), file=sys.stderr)