Skip to content

Commit

Permalink
[src,egs,scripts] Merging RNNLM-related changes which were in wrong b…
Browse files Browse the repository at this point in the history
…ranch (kaldi-asr#2092)
  • Loading branch information
hainan-xv authored and eginhard committed Jan 11, 2018
1 parent f096a99 commit da9a7fd
Show file tree
Hide file tree
Showing 19 changed files with 800 additions and 63 deletions.
19 changes: 13 additions & 6 deletions egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,25 @@ final_lm=ami_fsh.o3g.kn
LM=$final_lm.pr1-7

if [ $stage -le 3 ]; then
# for decode_set in dev; do
for decode_set in dev eval; do
basedir=exp/$mic/nnet3/tdnn_sp/
decode_dir=${basedir}/decode_${decode_set}

# Lattice rescoring
steps/lmrescore_rnnlm_lat.sh \
--cmd "$tfrnnlm_cmd --mem 16G" \
--rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \
# pruned lattice rescoring
steps/tfrnnlm/lmrescore_rnnlm_lat_pruned.sh \
--cmd "$tfrnnlm_cmd --mem 4G" \
--weight $weight --max-ngram-order $ngram_order \
data/lang_$LM $dir \
data/$mic/${decode_set}_hires ${decode_dir} \
${decode_dir}.unk.fast.tfrnnlm.lat.${ngram_order}gram.$weight &
${decode_dir}_tfrnnlm_lat_${ngram_order}gram &

# Lattice rescoring, unpruned (slow) version
# steps/tfrnnlm/lmrescore_rnnlm_lat.sh \
# --cmd "$tfrnnlm_cmd --mem 4G" \
# --weight $weight --max-ngram-order $ngram_order \
# data/lang_$LM $dir \
# data/$mic/${decode_set}_hires ${decode_dir} \
# ${decode_dir}_lat_${ngram_order}gram_unpruned &

done
fi
Expand Down
10 changes: 8 additions & 2 deletions egs/swbd/s5c/local/rnnlm/tuning/run_lstm_1e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-orde
# if it's set, it merges histories in the lattice if they share
# the same ngram history and this prevents the lattice from
# exploding exponentially
pruned_rescore=true

. cmd.sh
. utils/parse_options.sh
Expand Down Expand Up @@ -95,12 +96,17 @@ fi

if [ $stage -le 4 ] && $run_rescore; then
echo "$0: Perform lattice-rescoring on $ac_model_dir"
LM=sw1_fsh_fg
LM=sw1_fsh_fg # using the 4-gram const arpa file as old lm
# LM=sw1_tg # if using the original 3-gram G.fst as old lm
pruned=
if $pruned_rescore; then
pruned=_pruned
fi
for decode_set in eval2000; do
decode_dir=${ac_model_dir}/decode_${decode_set}_${LM}_looped

# Lattice rescoring
rnnlm/lmrescore.sh \
rnnlm/lmrescore$pruned.sh \
--cmd "$decode_cmd --mem 4G" \
--weight 0.5 --max-ngram-order $ngram_order \
data/lang_$LM $dir \
Expand Down
19 changes: 5 additions & 14 deletions egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,10 @@
cmd=run.pl
skip_scoring=false
max_ngram_order=4
N=10
inv_acwt=12
weight=1.0 # Interpolation weight for RNNLM.
# End configuration section.
acwt=0.1
weight=0.5 # Interpolation weight for RNNLM.
rnnlm_ver=
#layer_string=
# End configuration section.

echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -56,11 +54,6 @@ if [ "$rnnlm_ver" == "cuedrnnlm" ]; then
first_arg=$rnnlm_dir/rnn.wlist
fi

if [ "$rnnlm_ver" == "tensorflow" ]; then
rescoring_binary="lattice-lmrescore-tf-rnnlm"
first_arg="$rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final"
fi

oldlm=$oldlang/G.fst
if [ -f $oldlang/G.carpa ]; then
oldlm=$oldlang/G.carpa
Expand All @@ -70,7 +63,7 @@ elif [ ! -f $oldlm ]; then
fi

[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1;
[ ! -f $rnnlm_dir/rnnlm ] && [ ! -d $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1;
[ ! -f $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1;
[ ! -f $rnnlm_dir/unk.probs ] &&\
echo "$0: Missing file $rnnlm_dir/unk.probs" && exit 1;
[ ! -f $oldlang/words.txt ] &&\
Expand All @@ -83,8 +76,6 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) {

oldlm_command="fstproject --project_output=true $oldlm |"

acwt=`perl -e "print (1.0/$inv_acwt);"`

mkdir -p $outdir/log
nj=`cat $indir/num_jobs` || exit 1;
cp $indir/num_jobs $outdir
Expand Down Expand Up @@ -112,7 +103,7 @@ if ! $skip_scoring ; then
[ ! -x local/score.sh ] && echo $err_msg && exit 1;
local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
echo "Not scoring because requested so..."
echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;
101 changes: 101 additions & 0 deletions egs/wsj/s5/steps/tfrnnlm/lmrescore_rnnlm_lat.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/bin/bash

# Copyright 2015 Guoguo Chen
#           2017 Hainan Xu
# Apache 2.0

# This script rescores lattices with an RNNLM trained with TensorFlow.
# A faster and more accurate version of the algorithm is at
# steps/tfrnnlm/lmrescore_rnnlm_lat_pruned.sh, which is preferred.
# One example recipe of this script is at egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh

# Begin configuration section.
cmd=run.pl
skip_scoring=false
max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram-order
                  # if it's set, it merges histories in the lattice if they share
                  # the same ngram history and this prevents the lattice from
                  # exploding exponentially. Details of the n-gram approximation
                  # method are described in section 2.3 of the paper
                  # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
weight=0.5 # Interpolation weight for RNNLM.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
  echo "Does language model rescoring of lattices (remove old LM, add new LM)"
  echo "with TensorFlow RNNLM."
  echo ""
  echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\"
  echo " <data-dir> <input-decode-dir> <output-decode-dir>"
  echo " e.g.: $0 data/lang_tg data/tensorflow_lstm data/test \\"
  echo " exp/tri3/test_tg exp/tri3/test_tfrnnlm"
  echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
  exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
rnnlm_dir=$2
data=$3
indir=$4
outdir=$5

# Prefer the const-arpa LM if present, otherwise fall back to G.fst.
oldlm=$oldlang/G.fst
if [ -f "$oldlang/G.carpa" ]; then
  oldlm=$oldlang/G.carpa
elif [ ! -f "$oldlm" ]; then
  echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\
    exit 1;
fi

echo "$0: using $oldlm as old LM"

# The TensorFlow model is stored as a directory, not a single file.
[ ! -d "$rnnlm_dir/rnnlm" ] && echo "$0: Missing tf model folder $rnnlm_dir/rnnlm" && exit 1;

for f in "$rnnlm_dir/unk.probs" "$oldlang/words.txt" "$indir/lat.1.gz"; do
  [ ! -f "$f" ] && echo "$0: Missing file $f" && exit 1
done

# Validate the interpolation weight using awk (shell has no float comparison).
awk -v n="$0" -v w="$weight" 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1;

oldlm_command="fstproject --project_output=true $oldlm |"

mkdir -p "$outdir/log"
nj=$(cat "$indir/num_jobs") || exit 1;
cp "$indir/num_jobs" "$outdir"

# Old LM scores are removed by re-adding them with a negative scale.
oldlm_weight=$(perl -e "print -1.0 * $weight;")
if [ "$oldlm" == "$oldlang/G.fst" ]; then
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore --lm-scale=$oldlm_weight \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \
    lattice-lmrescore-tf-rnnlm --lm-scale=$weight \
    --max-ngram-order=$max_ngram_order \
    $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \
    "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
else
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm" ark:- \| \
    lattice-lmrescore-tf-rnnlm --lm-scale=$weight \
    --max-ngram-order=$max_ngram_order \
    $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \
    "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
fi

if ! $skip_scoring ; then
  err_msg="$0: Not scoring because local/score.sh does not exist or not executable."
  [ ! -x local/score.sh ] && echo "$err_msg" && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
  echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;

94 changes: 94 additions & 0 deletions egs/wsj/s5/steps/tfrnnlm/lmrescore_rnnlm_lat_pruned.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/bin/bash

# Copyright 2015 Guoguo Chen
#           2017 Hainan Xu
# Apache 2.0

# This script rescores lattices with an RNNLM trained with TensorFlow.
# It uses a pruned algorithm to speed up the runtime and improve the accuracy,
# and is an improved version of steps/tfrnnlm/lmrescore_rnnlm_lat.sh with
# the exact same interface.
# The details of the pruning algorithm are described in
# http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
# One example recipe of this script is at egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh

# Begin configuration section.
cmd=run.pl
skip_scoring=false
max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram-order
                  # if it's set, it merges histories in the lattice if they share
                  # the same ngram history and this prevents the lattice from
                  # exploding exponentially. Details of the n-gram approximation
                  # method are described in section 2.3 of the paper
                  # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
acwt=0.1   # Acoustic scale used during lattice pruning.
weight=0.5 # Interpolation weight for RNNLM.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
  echo "Does language model rescoring of lattices (remove old LM, add new LM)"
  echo "with RNNLM."
  echo ""
  echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\"
  echo " <data-dir> <input-decode-dir> <output-decode-dir>"
  echo " e.g.: $0 data/lang_tg data/tensorflow_lstm data/test \\"
  echo " exp/tri3/test_tg exp/tri3/test_tfrnnlm"
  echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
  exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
rnnlm_dir=$2
data=$3
indir=$4
outdir=$5

# Prefer the const-arpa LM if present; the pruned rescorer takes the old LM
# directly and needs to be told which format it is in.
oldlm=$oldlang/G.fst
carpa_option=

if [ -f "$oldlang/G.carpa" ]; then
  oldlm=$oldlang/G.carpa
  carpa_option="--use-const-arpa=true"
elif [ ! -f "$oldlm" ]; then
  echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\
    exit 1;
fi

echo "$0: using $oldlm as old LM"

# The TensorFlow model is stored as a directory, not a single file.
[ ! -d "$rnnlm_dir/rnnlm" ] && echo "$0: Missing tf model folder $rnnlm_dir/rnnlm" && exit 1;

for f in "$rnnlm_dir/unk.probs" "$oldlang/words.txt" "$indir/lat.1.gz"; do
  [ ! -f "$f" ] && echo "$0: Missing file $f" && exit 1
done

# Validate the interpolation weight using awk (shell has no float comparison).
awk -v n="$0" -v w="$weight" 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1;

mkdir -p "$outdir/log"
nj=$(cat "$indir/num_jobs") || exit 1;
cp "$indir/num_jobs" "$outdir"

# Single-pass pruned rescoring: old-LM removal and RNNLM addition are done
# inside one binary, so no negative-weight first pass is needed here.
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
  lattice-lmrescore-tf-rnnlm-pruned --lm-scale=$weight \
  --acoustic-scale=$acwt --max-ngram-order=$max_ngram_order \
  $carpa_option $oldlm $oldlang/words.txt \
  $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final "$rnnlm_dir/rnnlm" \
  "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;

if ! $skip_scoring ; then
  err_msg="$0: Not scoring because local/score.sh does not exist or not executable."
  [ ! -x local/score.sh ] && echo "$err_msg" && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
  echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;
11 changes: 7 additions & 4 deletions scripts/rnnlm/lmrescore.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
# Apache 2.0

# This script rescores lattices with KALDI RNNLM.
# It uses a simple n-gram approximation to limit the search space;
# A faster and more accurate way to rescore is at rnnlm/lmrescore_pruned.sh
# which is preferred

# Begin configuration section.
cmd=run.pl
Expand All @@ -14,14 +17,14 @@ max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram-
# the same ngram history and this prevents the lattice from
# exploding exponentially. Details of the n-gram approximation
# method are described in section 2.3 of the paper
# http://www.cs.jhu.edu/~hxu/tf.pdf
# http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf

weight=0.5 # Interpolation weight for RNNLM.
normalize=false # If true, we add a normalization step to the output of the RNNLM
# so that it adds up to *exactly* 1. Note that this is not necessary
# as in our RNNLM setup, a properly trained network would automatically
# have its normalization term close to 1. The details of this
# could be found at http://www.cs.jhu.edu/~hxu/rnnlm.pdf
# could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf

# End configuration section.

Expand Down Expand Up @@ -109,11 +112,11 @@ else
fi

if ! $skip_scoring ; then
err_msg="Not scoring because local/score.sh does not exist or not executable."
err_msg="$0: Not scoring because local/score.sh does not exist or not executable."
[ ! -x local/score.sh ] && echo $err_msg && exit 1;
local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
echo "Not scoring because requested so..."
echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;
Loading

0 comments on commit da9a7fd

Please sign in to comment.