[egs] fix bugs in Multi-database English LVCSR recipe (kaldi-asr#1785)

* Fixed bugs in Multi-database English LVCSR recipe
* Modified some headers
hainan-xv · Aug 4, 2017 · c4a7352 · c4a7352
1 parent 35faa4d
commit c4a7352
Show file tree

Hide file tree

Showing 15 changed files with 141 additions and 24 deletions.
diff --git a/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py b/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python
+
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
 # convert acronyms in swbd decode result to fisher convention
 # e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
 # en_4156 B 414.26 0.16 u

diff --git a/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py b/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python
+
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
 # convert acronyms in swbd decode result to fisher convention
 # e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
 # en_4156 B 414.26 0.16 u

diff --git a/egs/fisher_swbd/s5/local/format_acronyms_dict.py b/egs/fisher_swbd/s5/local/format_acronyms_dict.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python
+
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
 # convert acronyms in swbd dict to fisher convention
 # IBM to i._b._m.
 # BBC to b._b._c.

diff --git a/egs/fisher_swbd/s5/local/map_acronyms_transcripts.py b/egs/fisher_swbd/s5/local/map_acronyms_transcripts.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python
+
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
 # convert acronyms in swbd transcript to fisher convention
 # according to first two columns in the input acronyms mapping
 

diff --git a/egs/multi_en/s5/local/format_acronyms_ctm_eval2000.py b/egs/multi_en/s5/local/format_acronyms_ctm_eval2000.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
+# convert acronyms in swbd decode result to fisher convention
+# e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
+# en_4156 B 414.26 0.16 u
+# en_4156 B 414.42 0.16 c
+# en_4156 B 414.58 0.16 l
+# en_4156 B 414.74 0.17 a
+
+import argparse,re
+__author__ = 'Minhua Wu'
+
+parser = argparse.ArgumentParser(description='format acronyms from a._b._c. to a b c')
+parser.add_argument('-i','--input', help='Input ctm file ',required=True)
+parser.add_argument('-o','--output',help='Output ctm file', required=True)
+args = parser.parse_args()
+
+# 'with' closes both files (flushing the output) even if a line fails to parse
+with open(args.input, "r") as fin, open(args.output, "w") as fout:
+    for line in fin:
+        items = line.split()
+
+        # fields used here: <file> <channel> <start> <duration> <word>;
+        # lines without a word field (e.g. blank lines) and words without
+        # a "." are copied through unchanged
+        if len(items) < 5 or "." not in items[4]:
+            fout.write(line)
+            continue
+
+        prefix = ' '.join(items[:2])
+        letters = items[4].split("._")
+        acronym_period = round(float(items[3]), 2)
+        # every letter gets an equal (rounded) share of the word's duration ...
+        letter_slot = round(acronym_period / len(letters), 2)
+        time_start = round(float(items[2]), 2)
+        for l in letters[:-1]:
+            time = " %.2f %.2f " % (time_start, letter_slot)
+            fout.write(prefix + time + l + "\n")
+            time_start = time_start + letter_slot
+        # ... and the last letter absorbs the rounding remainder
+        last_slot = acronym_period - letter_slot * (len(letters) - 1)
+        time = " %.2f %.2f " % (time_start, last_slot)
+        # possessives: rewrite ".'s" or ".s" in the last piece as "'s"
+        letters[-1] = re.sub(r"\.'s", "'s", letters[-1])
+        letters[-1] = re.sub(r"\.s", "'s", letters[-1])
+        fout.write(prefix + time + letters[-1].replace('.','') + "\n")
diff --git a/egs/multi_en/s5/local/format_acronyms_ctm_rt03.py b/egs/multi_en/s5/local/format_acronyms_ctm_rt03.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
+# split acronyms in swbd decode result into one ctm entry per letter
+# e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
+# en_4156 B 414.26 0.16 u.
+# en_4156 B 414.42 0.16 c.
+# en_4156 B 414.58 0.16 l.
+# en_4156 B 414.74 0.17 a.
+
+import argparse,re
+__author__ = 'Minhua Wu'
+
+parser = argparse.ArgumentParser(description='format acronyms from a._b._c. to a b c')
+parser.add_argument('-i','--input', help='Input ctm file ',required=True)
+parser.add_argument('-o','--output',help='Output ctm file', required=True)
+args = parser.parse_args()
+
+# 'with' closes both files (flushing the output) even if a line fails to parse
+with open(args.input, "r") as fin, open(args.output, "w") as fout:
+    for line in fin:
+        items = line.split()
+
+        # fields used here: <file> <channel> <start> <duration> <word>;
+        # lines without a word field (e.g. blank lines) and words without
+        # a "." are copied through unchanged
+        if len(items) < 5 or "." not in items[4]:
+            fout.write(line)
+            continue
+
+        prefix = ' '.join(items[:2])
+        letters = items[4].split("._")
+        acronym_period = round(float(items[3]), 2)
+        # every letter gets an equal (rounded) share of the word's duration ...
+        letter_slot = round(acronym_period / len(letters), 2)
+        time_start = round(float(items[2]), 2)
+        for l in letters[:-1]:
+            time = " %.2f %.2f " % (time_start, letter_slot)
+            # unlike the eval2000 variant, each letter keeps its trailing "."
+            fout.write(prefix + time + l + ".\n")
+            time_start = time_start + letter_slot
+        # ... and the last letter absorbs the rounding remainder
+        last_slot = acronym_period - letter_slot * (len(letters) - 1)
+        time = " %.2f %.2f " % (time_start, last_slot)
+        # possessives: rewrite ".'s" or ".s" in the last piece as "'s"
+        letters[-1] = re.sub(r"\.'s", "'s", letters[-1])
+        letters[-1] = re.sub(r"\.s", "'s", letters[-1])
+        fout.write(prefix + time + letters[-1] + "\n")
diff --git a/egs/multi_en/s5/local/normalize_transcript.py b/egs/multi_en/s5/local/normalize_transcript.py
@@ -29,7 +29,10 @@ def main():
     with open(sys.argv[1], 'r') as f:
         for line in f.readlines():
             chunks = line.split(' ')
-            sys.stdout.write(chunks[0] + ' ' + normalize(' '.join(chunks[1:])))
+            if len(chunks) > 1:
+                sys.stdout.write(chunks[0].strip() + ' ' + normalize(' '.join(chunks[1:])))
+            else:
+                sys.stdout.write(chunks[0].strip() + '\n')
 
 if __name__ == '__main__':
     main()
diff --git a/egs/multi_en/s5/local/score_sclite.sh b/egs/multi_en/s5/local/score_sclite.sh
@@ -46,7 +46,9 @@ for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \
   [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
 done
 
-name=`basename $data`; # e.g. eval2000
+# the structure of data dirs in this recipe is like data/eval2000/test
+data_dir=`dirname $data`;
+name=`basename $data_dir`; # e.g. eval2000
 
 mkdir -p $dir/scoring/log
 

diff --git a/egs/multi_en/s5/local/swbd_format_acronyms_dict.py b/egs/multi_en/s5/local/swbd_format_acronyms_dict.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python
 
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
 ###########################################################################################
 # This script was copied from egs/fisher_swbd/s5/local/format_acronyms_dict.py
 # The source commit was e69198c3dc5633f98eb88e1cdf20b2521a598f21

diff --git a/egs/multi_en/s5/local/swbd_map_acronyms_transcripts.py b/egs/multi_en/s5/local/swbd_map_acronyms_transcripts.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python
 
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
 ###########################################################################################
 # This script was copied from egs/fisher_swbd/s5/local/map_acronyms_transcripts.py
 # The source commit was e69198c3dc5633f98eb88e1cdf20b2521a598f21

diff --git a/egs/multi_en/s5/local/train_lms.sh b/egs/multi_en/s5/local/train_lms.sh
@@ -28,28 +28,12 @@ done
 
 dir=data/local/lm
 mkdir -p $dir
-export LC_ALL=C # You'll get errors about things being not sorted, if you
-# have a different locale.
-export PATH=$PATH:`pwd`/../../../tools/kaldi_lm
-( # First make sure the kaldi_lm toolkit is installed.
- cd ../../../tools || exit 1;
- if [ -d kaldi_lm ]; then
-   echo Not installing the kaldi_lm toolkit since it is already there.
- else
-   echo Downloading and installing the kaldi_lm tools
-   if [ ! -f kaldi_lm.tar.gz ]; then
-     wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz ||
-     wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
-   fi
-   tar -xvzf kaldi_lm.tar.gz || exit 1;
-   cd kaldi_lm
-   make || exit 1;
-   echo Done making the kaldi_lm tools
- fi
-) || exit 1;
-
-mkdir -p $dir
 
+kaldi_lm=`which train_lm.sh`
+if [ ! -x "$kaldi_lm" ]; then  # quoted: unquoted and empty, this becomes `[ ! -x ]`, which is false, so the check would never fire
+  echo "train_lm.sh is not found. Look at tools/extras/install_kaldi_lm.sh"
+  exit 1
+fi
 
 cleantext=$dir/text.no_oov
 

diff --git a/egs/multi_en/s5/run.sh b/egs/multi_en/s5/run.sh
@@ -17,6 +17,12 @@ wsj1=
 eval2000=
 rt03=
 
+# check for kaldi_lm: get_word_map.pl on PATH is used as the sign that the tools are installed
+which get_word_map.pl > /dev/null
+if [ $? -ne 0 ]; then
+  echo "This recipe requires installation of tools/kaldi_lm. Please run extras/install_kaldi_lm.sh in tools/" && exit 1;
+fi
+
 # preset paths
 case $(hostname -d) in
   clsp.jhu.edu)
@@ -216,7 +222,7 @@ fi
 
 # reestimate LM with silprobs
 if [ $stage -le 15 ]; then
-#  steps/get_prons.sh --cmd "$train_cmd" data/$multi/tri3 data/lang_nosp exp/$multi/tri3
+  steps/get_prons.sh --cmd "$train_cmd" data/$multi/tri3 data/lang_nosp exp/$multi/tri3
   utils/dict_dir_add_pronprobs.sh --max-normalize true \
     data/local/dict_nosp exp/$multi/tri3/pron_counts_nowb.txt \
     exp/$multi/tri3/sil_counts_nowb.txt exp/$multi/tri3/pron_bigram_counts_nowb.txt data/local/dict

diff --git a/egs/swbd/s5c/local/format_acronyms_dict.py b/egs/swbd/s5c/local/format_acronyms_dict.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python
+
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
 # convert acronyms in swbd dict to fisher convention
 # IBM to i._b._m.
 # BBC to b._b._c.

diff --git a/egs/swbd/s5c/local/map_acronyms_ctm.py b/egs/swbd/s5c/local/map_acronyms_ctm.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python
+
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
 # convert acronyms in swbd decode result
 # e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
 # en_4156 B 414.26 0.16 u

diff --git a/egs/swbd/s5c/local/map_acronyms_transcripts.py b/egs/swbd/s5c/local/map_acronyms_transcripts.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python
+
+# Copyright 2015  Minhua Wu
+# Apache 2.0
+
 # convert acronyms in swbd transcript to fisher convention
 # according to first two columns in the input acronyms mapping