Skip to content

Commit

Permalink
[src,egs,scripts] Merging RNNLM-related changes which were in wrong b…
Browse files Browse the repository at this point in the history
…ranch (kaldi-asr#2092)
  • Loading branch information
hainan-xv authored and eginhard committed Jan 11, 2018
1 parent f096a99 commit da9a7fd
Show file tree
Hide file tree
Showing 19 changed files with 800 additions and 63 deletions.
19 changes: 13 additions & 6 deletions egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,25 @@ final_lm=ami_fsh.o3g.kn
LM=$final_lm.pr1-7

if [ $stage -le 3 ]; then
# for decode_set in dev; do
for decode_set in dev eval; do
basedir=exp/$mic/nnet3/tdnn_sp/
decode_dir=${basedir}/decode_${decode_set}

# Lattice rescoring
steps/lmrescore_rnnlm_lat.sh \
--cmd "$tfrnnlm_cmd --mem 16G" \
--rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \
# pruned lattice rescoring
steps/tfrnnlm/lmrescore_rnnlm_lat_pruned.sh \
--cmd "$tfrnnlm_cmd --mem 4G" \
--weight $weight --max-ngram-order $ngram_order \
data/lang_$LM $dir \
data/$mic/${decode_set}_hires ${decode_dir} \
${decode_dir}.unk.fast.tfrnnlm.lat.${ngram_order}gram.$weight &
${decode_dir}_tfrnnlm_lat_${ngram_order}gram &

# Lattice rescoring, unpruned (slow) version
# steps/tfrnnlm/lmrescore_rnnlm_lat.sh \
# --cmd "$tfrnnlm_cmd --mem 4G" \
# --weight $weight --max-ngram-order $ngram_order \
# data/lang_$LM $dir \
# data/$mic/${decode_set}_hires ${decode_dir} \
# ${decode_dir}_lat_${ngram_order}gram_unpruned &

done
fi
Expand Down
10 changes: 8 additions & 2 deletions egs/swbd/s5c/local/rnnlm/tuning/run_lstm_1e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-orde
# if it's set, it merges histories in the lattice if they share
# the same ngram history and this prevents the lattice from
# exploding exponentially
pruned_rescore=true

. cmd.sh
. utils/parse_options.sh
Expand Down Expand Up @@ -95,12 +96,17 @@ fi

if [ $stage -le 4 ] && $run_rescore; then
echo "$0: Perform lattice-rescoring on $ac_model_dir"
LM=sw1_fsh_fg
LM=sw1_fsh_fg # using the 4-gram const arpa file as old lm
# LM=sw1_tg # if using the original 3-gram G.fst as old lm
pruned=
if $pruned_rescore; then
pruned=_pruned
fi
for decode_set in eval2000; do
decode_dir=${ac_model_dir}/decode_${decode_set}_${LM}_looped

# Lattice rescoring
rnnlm/lmrescore.sh \
rnnlm/lmrescore$pruned.sh \
--cmd "$decode_cmd --mem 4G" \
--weight 0.5 --max-ngram-order $ngram_order \
data/lang_$LM $dir \
Expand Down
19 changes: 5 additions & 14 deletions egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,10 @@
cmd=run.pl
skip_scoring=false
max_ngram_order=4
N=10
inv_acwt=12
weight=1.0 # Interpolation weight for RNNLM.
# End configuration section.
acwt=0.1
weight=0.5 # Interpolation weight for RNNLM.
rnnlm_ver=
#layer_string=
# End configuration section.

echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -56,11 +54,6 @@ if [ "$rnnlm_ver" == "cuedrnnlm" ]; then
first_arg=$rnnlm_dir/rnn.wlist
fi

if [ "$rnnlm_ver" == "tensorflow" ]; then
rescoring_binary="lattice-lmrescore-tf-rnnlm"
first_arg="$rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final"
fi

oldlm=$oldlang/G.fst
if [ -f $oldlang/G.carpa ]; then
oldlm=$oldlang/G.carpa
Expand All @@ -70,7 +63,7 @@ elif [ ! -f $oldlm ]; then
fi

[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1;
[ ! -f $rnnlm_dir/rnnlm ] && [ ! -d $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1;
[ ! -f $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1;
[ ! -f $rnnlm_dir/unk.probs ] &&\
echo "$0: Missing file $rnnlm_dir/unk.probs" && exit 1;
[ ! -f $oldlang/words.txt ] &&\
Expand All @@ -83,8 +76,6 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) {

oldlm_command="fstproject --project_output=true $oldlm |"

acwt=`perl -e "print (1.0/$inv_acwt);"`

mkdir -p $outdir/log
nj=`cat $indir/num_jobs` || exit 1;
cp $indir/num_jobs $outdir
Expand Down Expand Up @@ -112,7 +103,7 @@ if ! $skip_scoring ; then
[ ! -x local/score.sh ] && echo $err_msg && exit 1;
local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
echo "Not scoring because requested so..."
echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;
101 changes: 101 additions & 0 deletions egs/wsj/s5/steps/tfrnnlm/lmrescore_rnnlm_lat.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/bin/bash

# Copyright 2015 Guoguo Chen
#           2017 Hainan Xu
# Apache 2.0

# This script rescores lattices with an RNNLM trained with TensorFlow.
# A faster and more accurate version of the algorithm is at
# steps/tfrnnlm/lmrescore_rnnlm_lat_pruned.sh, which is preferred.
# One example recipe of this script is at egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh

# Begin configuration section.
cmd=run.pl
skip_scoring=false
max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram-order
                  # if it's set, it merges histories in the lattice if they share
                  # the same ngram history and this prevents the lattice from
                  # exploding exponentially. Details of the n-gram approximation
                  # method are described in section 2.3 of the paper
                  # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
weight=0.5 # Interpolation weight for RNNLM.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
  echo "Does language model rescoring of lattices (remove old LM, add new LM)"
  echo "with TensorFlow RNNLM."
  echo ""
  echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\"
  echo " <data-dir> <input-decode-dir> <output-decode-dir>"
  echo " e.g.: $0 data/lang_tg data/tensorflow_lstm data/test \\"
  echo " exp/tri3/test_tg exp/tri3/test_tfrnnlm"
  echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
  exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
rnnlm_dir=$2
data=$3
indir=$4
outdir=$5

# Prefer the const-arpa LM if present, otherwise fall back to G.fst.
oldlm=$oldlang/G.fst
if [ -f "$oldlang/G.carpa" ]; then
  oldlm=$oldlang/G.carpa
elif [ ! -f "$oldlm" ]; then
  echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\
    exit 1;
fi

echo "$0: using $oldlm as old LM"

# The TensorFlow model is stored as a directory, not a single file.
[ ! -d "$rnnlm_dir/rnnlm" ] && echo "$0: Missing tf model folder $rnnlm_dir/rnnlm" && exit 1;

for f in "$rnnlm_dir/unk.probs" "$oldlang/words.txt" "$indir/lat.1.gz"; do
  [ ! -f "$f" ] && echo "$0: Missing file $f" && exit 1
done

# Validate the interpolation weight using awk (shell has no float comparison).
awk -v n="$0" -v w="$weight" 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1;

oldlm_command="fstproject --project_output=true $oldlm |"

mkdir -p "$outdir/log"
nj=$(cat "$indir/num_jobs") || exit 1;
cp "$indir/num_jobs" "$outdir"

# Old LM scores are removed by re-adding them with a negative scale.
oldlm_weight=$(perl -e "print -1.0 * $weight;")
if [ "$oldlm" == "$oldlang/G.fst" ]; then
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore --lm-scale=$oldlm_weight \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \
    lattice-lmrescore-tf-rnnlm --lm-scale=$weight \
    --max-ngram-order=$max_ngram_order \
    $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \
    "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
else
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm" ark:- \| \
    lattice-lmrescore-tf-rnnlm --lm-scale=$weight \
    --max-ngram-order=$max_ngram_order \
    $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \
    "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
fi

if ! $skip_scoring ; then
  err_msg="$0: Not scoring because local/score.sh does not exist or not executable."
  [ ! -x local/score.sh ] && echo "$err_msg" && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
  echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;

94 changes: 94 additions & 0 deletions egs/wsj/s5/steps/tfrnnlm/lmrescore_rnnlm_lat_pruned.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/bin/bash

# Copyright 2015 Guoguo Chen
#           2017 Hainan Xu
# Apache 2.0

# This script rescores lattices with an RNNLM trained with TensorFlow.
# It uses a pruned algorithm to speed up the runtime and improve the accuracy,
# and is an improved version of steps/tfrnnlm/lmrescore_rnnlm_lat.sh with
# the exact same interface.
# The details of the pruning algorithm are described in
# http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
# One example recipe of this script is at egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh

# Begin configuration section.
cmd=run.pl
skip_scoring=false
max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram-order
                  # if it's set, it merges histories in the lattice if they share
                  # the same ngram history and this prevents the lattice from
                  # exploding exponentially. Details of the n-gram approximation
                  # method are described in section 2.3 of the paper
                  # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
acwt=0.1   # Acoustic scale used during lattice pruning.
weight=0.5 # Interpolation weight for RNNLM.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
  echo "Does language model rescoring of lattices (remove old LM, add new LM)"
  echo "with RNNLM."
  echo ""
  echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\"
  echo " <data-dir> <input-decode-dir> <output-decode-dir>"
  echo " e.g.: $0 data/lang_tg data/tensorflow_lstm data/test \\"
  echo " exp/tri3/test_tg exp/tri3/test_tfrnnlm"
  echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
  exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
rnnlm_dir=$2
data=$3
indir=$4
outdir=$5

# Prefer the const-arpa LM if present; the pruned rescorer takes the old LM
# directly and needs to be told which format it is in.
oldlm=$oldlang/G.fst
carpa_option=

if [ -f "$oldlang/G.carpa" ]; then
  oldlm=$oldlang/G.carpa
  carpa_option="--use-const-arpa=true"
elif [ ! -f "$oldlm" ]; then
  echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\
    exit 1;
fi

echo "$0: using $oldlm as old LM"

# The TensorFlow model is stored as a directory, not a single file.
[ ! -d "$rnnlm_dir/rnnlm" ] && echo "$0: Missing tf model folder $rnnlm_dir/rnnlm" && exit 1;

for f in "$rnnlm_dir/unk.probs" "$oldlang/words.txt" "$indir/lat.1.gz"; do
  [ ! -f "$f" ] && echo "$0: Missing file $f" && exit 1
done

# Validate the interpolation weight using awk (shell has no float comparison).
awk -v n="$0" -v w="$weight" 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1;

mkdir -p "$outdir/log"
nj=$(cat "$indir/num_jobs") || exit 1;
cp "$indir/num_jobs" "$outdir"

# Single-pass pruned rescoring: old-LM removal and RNNLM addition are done
# inside one binary, so no negative-weight first pass is needed here.
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
  lattice-lmrescore-tf-rnnlm-pruned --lm-scale=$weight \
  --acoustic-scale=$acwt --max-ngram-order=$max_ngram_order \
  $carpa_option $oldlm $oldlang/words.txt \
  $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final "$rnnlm_dir/rnnlm" \
  "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;

if ! $skip_scoring ; then
  err_msg="$0: Not scoring because local/score.sh does not exist or not executable."
  [ ! -x local/score.sh ] && echo "$err_msg" && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
  echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;
11 changes: 7 additions & 4 deletions scripts/rnnlm/lmrescore.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
# Apache 2.0

# This script rescores lattices with KALDI RNNLM.
# It uses a simple n-gram approximation to limit the search space;
# A faster and more accurate way to rescore is at rnnlm/lmrescore_pruned.sh
# which is preferred

# Begin configuration section.
cmd=run.pl
Expand All @@ -14,14 +17,14 @@ max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram-
# the same ngram history and this prevents the lattice from
# exploding exponentially. Details of the n-gram approximation
# method are described in section 2.3 of the paper
# http://www.cs.jhu.edu/~hxu/tf.pdf
# http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf

weight=0.5 # Interpolation weight for RNNLM.
normalize=false # If true, we add a normalization step to the output of the RNNLM
# so that it adds up to *exactly* 1. Note that this is not necessary
# as in our RNNLM setup, a properly trained network would automatically
# have its normalization term close to 1. The details of this
# could be found at http://www.cs.jhu.edu/~hxu/rnnlm.pdf
# could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf

# End configuration section.

Expand Down Expand Up @@ -109,11 +112,11 @@ else
fi

if ! $skip_scoring ; then
err_msg="Not scoring because local/score.sh does not exist or not executable."
err_msg="$0: Not scoring because local/score.sh does not exist or not executable."
[ ! -x local/score.sh ] && echo $err_msg && exit 1;
local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
echo "Not scoring because requested so..."
echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;
Loading

0 comments on commit da9a7fd

Please sign in to comment.