Skip to content

Commit

Permalink
[egs] fix bugs in Multi-database English LVCSR recipe (kaldi-asr#1785)
Browse files Browse the repository at this point in the history
* Fixed bugs in Multi-database English LVCSR recipe
* modified some headers
  • Loading branch information
xiaohui-zhang authored and jtrmal committed Aug 4, 2017
1 parent 35faa4d commit c4a7352
Show file tree
Hide file tree
Showing 15 changed files with 141 additions and 24 deletions.
4 changes: 4 additions & 0 deletions egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

# convert acronyms in swbd decode result to fisher convention
# e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
# en_4156 B 414.26 0.16 u
Expand Down
4 changes: 4 additions & 0 deletions egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

# convert acronyms in swbd decode result to fisher convention
# e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
# en_4156 B 414.26 0.16 u
Expand Down
4 changes: 4 additions & 0 deletions egs/fisher_swbd/s5/local/format_acronyms_dict.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

# convert acronyms in swbd dict to fisher convention
# IBM to i._b._m.
# BBC to b._b._c.
Expand Down
4 changes: 4 additions & 0 deletions egs/fisher_swbd/s5/local/map_acronyms_transcripts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

# convert acronyms in swbd transcript to fisher convention
# according to the first two columns of the input acronyms mapping

Expand Down
44 changes: 44 additions & 0 deletions egs/multi_en/s5/local/format_acronyms_ctm_eval2000.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

# convert acronyms in swbd decode result to fisher convention
# e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
# en_4156 B 414.26 0.16 u
# en_4156 B 414.42 0.16 c
# en_4156 B 414.58 0.16 l
# en_4156 B 414.74 0.17 a

import argparse
import re

__author__ = 'Minhua Wu'


def format_acronym_line(line):
    """Expand one ctm line whose word is an acronym into one line per letter.

    The line is split on whitespace; fields are read as
    <utt> <channel> <start> <duration> <word>.  If the word contains a
    period it is split on "._" and the duration is divided between the
    resulting letters: every letter but the last gets the duration divided
    by the letter count (rounded to 2 decimals) and the last letter gets
    whatever remains.  Periods are removed from the emitted letters, and a
    trailing ".'s" or ".s" on the last letter becomes "'s".

    Only the first two fields are carried over to the expanded lines; any
    field after the word is not copied.

    Args:
        line: one line of the ctm file, including its line terminator.

    Returns:
        The text to write to the output: `line` unchanged when the word has
        no period (or the line has fewer than five fields), otherwise the
        newline-terminated per-letter lines concatenated together.
    """
    items = line.split()
    # Blank or truncated lines have no word field; pass them through
    # untouched instead of failing on items[4].
    if len(items) < 5 or "." not in items[4]:
        return line

    letters = items[4].split("._")
    acronym_period = round(float(items[3]), 2)
    letter_slot = round(acronym_period / len(letters), 2)
    time_start = round(float(items[2]), 2)
    prefix = ' '.join(items[:2])

    out = []
    for letter in letters[:-1]:
        out.append("%s %.2f %.2f %s\n" % (prefix, time_start, letter_slot, letter))
        time_start = time_start + letter_slot

    # The last letter absorbs the rounding remainder so that the slots add
    # up to the original duration.
    last_slot = acronym_period - letter_slot * (len(letters) - 1)
    last_letter = re.sub(r"\.'s", "'s", letters[-1])
    last_letter = re.sub(r"\.s", "'s", last_letter)
    last_letter = last_letter.replace('.', '')
    out.append("%s %.2f %.2f %s\n" % (prefix, time_start, last_slot, last_letter))
    return ''.join(out)


def main(argv=None):
    """Parse -i/--input and -o/--output and convert the input ctm file.

    Args:
        argv: argument list to parse; None means use sys.argv[1:].
    """
    parser = argparse.ArgumentParser(description='format acronyms from a._b._c. to a b c')
    parser.add_argument('-i', '--input', help='Input ctm file ', required=True)
    parser.add_argument('-o', '--output', help='Output ctm file', required=True)
    args = parser.parse_args(argv)

    # Context managers guarantee both files are flushed and closed, even if
    # a malformed number makes float() raise part-way through.
    with open(args.input, "r") as fin, open(args.output, "w") as fout:
        for line in fin:
            fout.write(format_acronym_line(line))


if __name__ == '__main__':
    main()


44 changes: 44 additions & 0 deletions egs/multi_en/s5/local/format_acronyms_ctm_rt03.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

# convert acronyms in swbd decode result to fisher convention
# e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
# en_4156 B 414.26 0.16 u.
# en_4156 B 414.42 0.16 c.
# en_4156 B 414.58 0.16 l.
# en_4156 B 414.74 0.17 a.
# (unlike the eval2000 variant, each letter keeps its trailing period)

import argparse
import re

__author__ = 'Minhua Wu'


def format_acronym_line(line):
    """Expand one ctm line whose word is an acronym into one line per letter.

    The line is split on whitespace; fields are read as
    <utt> <channel> <start> <duration> <word>.  If the word contains a
    period it is split on "._" and the duration is divided between the
    resulting letters: every letter but the last gets the duration divided
    by the letter count (rounded to 2 decimals) and the last letter gets
    whatever remains.  Every letter but the last is written with a trailing
    period; the last letter is written as it appears after the split, except
    that a trailing ".'s" or ".s" becomes "'s".

    Only the first two fields are carried over to the expanded lines; any
    field after the word is not copied.

    Args:
        line: one line of the ctm file, including its line terminator.

    Returns:
        The text to write to the output: `line` unchanged when the word has
        no period (or the line has fewer than five fields), otherwise the
        newline-terminated per-letter lines concatenated together.
    """
    items = line.split()
    # Blank or truncated lines have no word field; pass them through
    # untouched instead of failing on items[4].
    if len(items) < 5 or "." not in items[4]:
        return line

    letters = items[4].split("._")
    acronym_period = round(float(items[3]), 2)
    letter_slot = round(acronym_period / len(letters), 2)
    time_start = round(float(items[2]), 2)
    prefix = ' '.join(items[:2])

    out = []
    for letter in letters[:-1]:
        # Splitting on "._" consumed the period; put it back on each letter.
        out.append("%s %.2f %.2f %s.\n" % (prefix, time_start, letter_slot, letter))
        time_start = time_start + letter_slot

    # The last letter absorbs the rounding remainder so that the slots add
    # up to the original duration.
    last_slot = acronym_period - letter_slot * (len(letters) - 1)
    last_letter = re.sub(r"\.'s", "'s", letters[-1])
    last_letter = re.sub(r"\.s", "'s", last_letter)
    out.append("%s %.2f %.2f %s\n" % (prefix, time_start, last_slot, last_letter))
    return ''.join(out)


def main(argv=None):
    """Parse -i/--input and -o/--output and convert the input ctm file.

    Args:
        argv: argument list to parse; None means use sys.argv[1:].
    """
    parser = argparse.ArgumentParser(description='format acronyms from a._b._c. to a b c')
    parser.add_argument('-i', '--input', help='Input ctm file ', required=True)
    parser.add_argument('-o', '--output', help='Output ctm file', required=True)
    args = parser.parse_args(argv)

    # Context managers guarantee both files are flushed and closed, even if
    # a malformed number makes float() raise part-way through.
    with open(args.input, "r") as fin, open(args.output, "w") as fout:
        for line in fin:
            fout.write(format_acronym_line(line))


if __name__ == '__main__':
    main()


5 changes: 4 additions & 1 deletion egs/multi_en/s5/local/normalize_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ def main():
with open(sys.argv[1], 'r') as f:
for line in f.readlines():
chunks = line.split(' ')
sys.stdout.write(chunks[0] + ' ' + normalize(' '.join(chunks[1:])))
if len(chunks) > 1:
sys.stdout.write(chunks[0].strip() + ' ' + normalize(' '.join(chunks[1:])))
else:
sys.stdout.write(chunks[0].strip() + '\n')

if __name__ == '__main__':
main()
4 changes: 3 additions & 1 deletion egs/multi_en/s5/local/score_sclite.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \
[ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done

name=`basename $data`; # e.g. eval2000
# the structure of data dirs in this recipe is like data/eval2000/test
data_dir=`dirname $data`;
name=`basename $data_dir`; # e.g. eval2000

mkdir -p $dir/scoring/log

Expand Down
3 changes: 3 additions & 0 deletions egs/multi_en/s5/local/swbd_format_acronyms_dict.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

###########################################################################################
# This script was copied from egs/fisher_swbd/s5/local/format_acronyms_dict.py
# The source commit was e69198c3dc5633f98eb88e1cdf20b2521a598f21
Expand Down
3 changes: 3 additions & 0 deletions egs/multi_en/s5/local/swbd_map_acronyms_transcripts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

###########################################################################################
# This script was copied from egs/fisher_swbd/s5/local/map_acronyms_transcripts.py
# The source commit was e69198c3dc5633f98eb88e1cdf20b2521a598f21
Expand Down
26 changes: 5 additions & 21 deletions egs/multi_en/s5/local/train_lms.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,28 +28,12 @@ done

dir=data/local/lm
mkdir -p $dir
export LC_ALL=C # You'll get errors about things being not sorted, if you
# have a different locale.
export PATH=$PATH:`pwd`/../../../tools/kaldi_lm
( # First make sure the kaldi_lm toolkit is installed.
cd ../../../tools || exit 1;
if [ -d kaldi_lm ]; then
echo Not installing the kaldi_lm toolkit since it is already there.
else
echo Downloading and installing the kaldi_lm tools
if [ ! -f kaldi_lm.tar.gz ]; then
wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz ||
wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
fi
tar -xvzf kaldi_lm.tar.gz || exit 1;
cd kaldi_lm
make || exit 1;
echo Done making the kaldi_lm tools
fi
) || exit 1;

mkdir -p $dir

kaldi_lm=`which train_lm.sh`
if [ ! -x $kaldi_lm ]; then
echo "train_lm.sh is not found. Look at tools/extra/install_kaldi_lm.sh"
exit 1
fi

cleantext=$dir/text.no_oov

Expand Down
8 changes: 7 additions & 1 deletion egs/multi_en/s5/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ wsj1=
eval2000=
rt03=

# check for kaldi_lm
which get_word_map.pl > /dev/null
if [ $? -ne 0 ]; then
echo "This recipe requires installation of tools/kaldi_lm. Please run extras/kaldi_lm.sh in tools/" && exit 1;
fi

# preset paths
case $(hostname -d) in
clsp.jhu.edu)
Expand Down Expand Up @@ -216,7 +222,7 @@ fi

# reestimate LM with silprobs
if [ $stage -le 15 ]; then
# steps/get_prons.sh --cmd "$train_cmd" data/$multi/tri3 data/lang_nosp exp/$multi/tri3
steps/get_prons.sh --cmd "$train_cmd" data/$multi/tri3 data/lang_nosp exp/$multi/tri3
utils/dict_dir_add_pronprobs.sh --max-normalize true \
data/local/dict_nosp exp/$multi/tri3/pron_counts_nowb.txt \
exp/$multi/tri3/sil_counts_nowb.txt exp/$multi/tri3/pron_bigram_counts_nowb.txt data/local/dict
Expand Down
4 changes: 4 additions & 0 deletions egs/swbd/s5c/local/format_acronyms_dict.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

# convert acronyms in swbd dict to fisher convention
# IBM to i._b._m.
# BBC to b._b._c.
Expand Down
4 changes: 4 additions & 0 deletions egs/swbd/s5c/local/map_acronyms_ctm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

# convert acronyms in swbd decode result
# e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
# en_4156 B 414.26 0.16 u
Expand Down
4 changes: 4 additions & 0 deletions egs/swbd/s5c/local/map_acronyms_transcripts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env python

# Copyright 2015 Minhua Wu
# Apache 2.0

# convert acronyms in swbd transcript to fisher convention
# according to the first two columns of the input acronyms mapping

Expand Down

0 comments on commit c4a7352

Please sign in to comment.