-
Notifications
You must be signed in to change notification settings - Fork 5.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
nnet3-rnnlm lattice rescoring draft #1906
Changes from 7 commits
0d839b0
699c956
ef09b62
390a1bb
dc49709
8a33e77
5965b87
483450d
00912f7
b1167a2
a52da29
3bdaa4d
2b08335
7cf4af8
8f35242
705ecc8
d19ecc1
232ef04
bd9936b
9cc7ba1
267177f
87f2f6c
c9bf5e0
091d4d5
a192ada
697f219
acb5211
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#!/bin/bash

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)  Tony Robinson
#           2015  Guoguo Chen
#           2017  Hainan Xu

# This script trains an RNNLM on the swbd LM-training data.
# This script takes no command-line arguments but takes the --cmd option
# (and the other options listed below).

# Begin configuration section.
. ./cmd.sh           # defines train_cmd etc.; per review, use these rather
                     # than a hard-coded run.pl.
cmd=$train_cmd

dir=exp/rnnlm_lstm_d
embedding_dim=800
lstm_rpd=200         # LSTM recurrent-projection dim
lstm_nrpd=200        # LSTM non-recurrent-projection dim
stage=-10
train_stage=-10

. utils/parse_options.sh

text=data/train/text
lexicon=data/local/dict_nosp/lexiconp.txt
text_dir=data/rnnlm/text_nosp
mkdir -p $dir/config
set -e

for f in $text $lexicon; do
  [ ! -f $f ] && \
    echo "$0: expected file $f to exist; run the swbd data preparation first" && exit 1
done

if [ $stage -le 0 ]; then
  mkdir -p $text_dir
  echo -n >$text_dir/dev.txt
  # hold out one in every 500 lines as dev data.
  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%500 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/swbd.txt
fi

if [ $stage -le 1 ]; then
  # the training scripts require that <s>, </s> and <brk> be present in a
  # particular order.
  awk '{print $1}' $lexicon | sort | uniq | \
    awk 'BEGIN{print "<eps> 0";print "<s> 1"; print "</s> 2"; print "<brk> 3";n=4;} {print $1, n++}' \
    >$dir/config/words.txt
  # words that are not present in words.txt but are in the training or dev
  # data will be mapped to <unk> during training (see oov.txt below).
  echo "<unk>" >$dir/config/oov.txt

  cat > $dir/config/data_weights.txt <<EOF
swbd 1 1.0
EOF

  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                             --unk-word="<unk>" \
                             --data-weights-file=$dir/config/data_weights.txt \
                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt

  # choose features
  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                           --use-constant-feature=true \
                           --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
                           $dir/config/words.txt > $dir/config/features.txt

  cat >$dir/config/xconfig <<EOF
input dim=$embedding_dim name=input
relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF
  rnnlm/validate_config_dir.sh $text_dir $dir/config
fi

if [ $stage -le 2 ]; then
  # Per review, the default --unigram-factor (100) no longer causes excessive
  # CPU use in rnnlm-get-egs, so we do not override it here.
  rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
fi

if [ $stage -le 3 ]; then
  # Use the configurable $cmd (default: train_cmd from cmd.sh) rather than a
  # hard-coded queue.pl, so the --cmd option actually takes effect.
  rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 3 \
                       --stage $train_stage --num-epochs 10 --cmd "$cmd" $dir
fi

exit 0
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#!/bin/bash

# Rescores chain-model decoding lattices with the Kaldi RNNLM trained by the
# companion training script (see exp/rnnlm_lstm_d).  Calls
# steps/lmrescore_rnnlm_lat.sh for each test set.

# Begin configuration section.
n=50
ngram_order=4             # approximate the RNNLM by merging histories that
                          # share the same most recent n-gram context.
rnndir=exp/rnnlm_lstm_d   # default RNNLM dir; set it BEFORE parse_options so
                          # that a user-supplied --rnndir is not clobbered.
id=rnn

. ./cmd.sh                # defines decode_cmd, used below.
. ./path.sh
. ./utils/parse_options.sh

set -e

LM=fsh_sw1_tg

for decode_set in eval2000; do
  dir=exp/chain/tdnn_lstm_1e_sp
  decode_dir=${dir}/decode_${decode_set}_$LM

  # Lattice rescoring
  steps/lmrescore_rnnlm_lat.sh \
    --cmd "$decode_cmd --mem 16G" \
    --rnnlm-ver kaldirnnlm  --weight 0.5 --max-ngram-order $ngram_order \
    data/lang_$LM $rnndir \
    data/${decode_set}_hires ${decode_dir} \
    ${decode_dir}.nnet3rnnlm.lat.${ngram_order}gram

done
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,7 @@ data=$1 | |
|
||
if [ -f $data/stm ]; then # use sclite scoring. | ||
echo "$data/stm exists: using local/score_sclite.sh" | ||
eval local/score_sclite.sh $orig_args | ||
eval local/score_sclite.sh "$orig_args" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just noticed that all of these changes are in swbd/s5. This is super outdated. You should be using s5c. I doubt that this problem (if there was a problem) occurs in the latest script. In any case let me know what the problem was, because I'd be surprised if this was really a bug, this script being so old. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIRC if I don't add the "", if the $orig_args has something like --cmd "queue.pl --mem 8G" it'll complain. |
||
else | ||
echo "$data/stm does not exist: using local/score_basic.sh" | ||
eval local/score_basic.sh $orig_args | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../../scripts/rnnlm/ |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,6 +61,11 @@ if [ "$rnnlm_ver" == "tensorflow" ]; then | |
first_arg="$rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final" | ||
fi | ||
|
||
if [ "$rnnlm_ver" == "kaldirnnlm" ]; then | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it might be better at this point to have a separate script for this type of lattice rescoring, located in scripts/rnnlm/. That will keep things separate and will make it easier to refactor in future. |
||
rescoring_binary="lattice-lmrescore-kaldi-rnnlm" | ||
first_arg="\"rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|\" $rnnlm_dir/config/words.txt " | ||
fi | ||
|
||
oldlm=$oldlang/G.fst | ||
if [ -f $oldlang/G.carpa ]; then | ||
oldlm=$oldlang/G.carpa | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -65,16 +65,19 @@ num_splits=$(cat $dir/text/info/num_splits) | |
num_repeats=$(cat $dir/text/info/num_repeats) | ||
text_files=$(for n in $(seq $num_splits); do echo $dir/text/$n.txt; done) | ||
vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}') | ||
embedding_type= | ||
|
||
if [ -f $dir/feat_embedding.0.mat ]; then | ||
sparse_features=true | ||
embedding_type=feat_embedding | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's just make this either "feat" or "word", remove the "_embedding". |
||
if [ -f $dir/word_embedding.0.mat ]; then | ||
echo "$0: error: $dir/feat_embedding.0.mat and $dir/word_embedding.0.mat both exist." | ||
exit 1; | ||
fi | ||
! [ -f $dir/word_feats.txt ] && echo "$0: expected $0/word_feats.txt to exist" && exit 1; | ||
else | ||
sparse_features=false | ||
embedding_type=word_embedding | ||
! [ -f $dir/word_embedding.0.mat ] && \ | ||
echo "$0: expected $dir/word_embedding.0.mat to exist" && exit 1 | ||
fi | ||
|
@@ -193,7 +196,7 @@ while [ $x -lt $num_iters ]; do | |
[ -f $dir/.train_error ] && \ | ||
echo "$0: failure on iteration $x of training, see $dir/log/train.$x.*.log for details." && exit 1 | ||
if [ $this_num_jobs -gt 1 ]; then | ||
# average the models and the embedding matrces. Use run.pl as we don't | ||
# average the models and the embedding matrces. Use run.pl as we don\'t | ||
# want this to wait on the queue (if there is a queue). | ||
src_models=$(for n in $(seq $this_num_jobs); do echo $dir/$[x+1].$n.raw; done) | ||
src_matrices=$(for n in $(seq $this_num_jobs); do echo $dir/${embedding_type}.$[x+1].$n.mat; done) | ||
|
@@ -219,8 +222,11 @@ if [ $stage -le $num_iters ]; then | |
echo "$0: best iteration (out of $num_iters) was $best_iter, linking it to final iteration." | ||
ln -sf $embedding_type.$best_iter.mat $dir/$embedding_type.final.mat | ||
ln -sf $best_iter.raw $dir/final.raw | ||
ln -sf $best_iter.raw $dir/rnnlm # to make it consistent with other RNNLMs | ||
fi | ||
|
||
touch $dir/unk.probs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. once we modify this setup to have its own rescoring scripts, unk.probs may no longer be needed. but I may merge this as-is for now. |
||
|
||
# Now get some diagnostics about the evolution of the objective function. | ||
if [ $stage -le $[num_iters+1] ]; then | ||
( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,9 @@ all: | |
EXTRA_CXXFLAGS = -Wno-sign-compare | ||
include ../kaldi.mk | ||
|
||
LDFLAGS += $(CUDA_LDFLAGS) | ||
LDLIBS += $(CUDA_LDLIBS) | ||
|
||
BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ | ||
lattice-lmrescore lattice-scale lattice-union lattice-to-post \ | ||
lattice-determinize lattice-oracle lattice-rmali \ | ||
|
@@ -21,18 +24,20 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ | |
lattice-confidence lattice-determinize-phone-pruned \ | ||
lattice-determinize-phone-pruned-parallel lattice-expand-ngram \ | ||
lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ | ||
lattice-arc-post lattice-determinize-non-compact \ | ||
lattice-arc-post lattice-determinize-non-compact lattice-lmrescore-kaldi-rnnlm \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the "latbin" directory now depends on nnet3 and cudamatrix, you will have to update the directory-level dependencies in ../Makefile. You could also see what misc/maintenance/reorder_addlibs.sh and misc/maintenance/find_missing_dependencies.sh do-- it may not be necessary to do this manually. |
||
lattice-lmrescore-pruned | ||
|
||
OBJFILES = | ||
|
||
cuda-compiled.o: ../kaldi.mk | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know where this line came from but it shouldn't be here. |
||
|
||
|
||
TESTFILES = | ||
|
||
ADDLIBS = ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../fstext/kaldi-fstext.a \ | ||
../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ | ||
../matrix/kaldi-matrix.a \ | ||
ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../lat/kaldi-lat.a ../nnet3/kaldi-nnet3.a ../lm/kaldi-lm.a \ | ||
../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ | ||
../util/kaldi-util.a \ | ||
../cudamatrix/kaldi-cudamatrix.a ../matrix/kaldi-matrix.a \ | ||
../base/kaldi-base.a | ||
|
||
include ../makefiles/default_rules.mk |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
// latbin/lattice-lmrescore-kaldi-rnnlm.cc

// Copyright 2017  Johns Hopkins University (author: Daniel Povey)
//           2017  Hainan Xu
//           2017  Yiming Wang

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#include "base/kaldi-common.h"
#include "fstext/fstext-lib.h"
#include "lat/kaldi-lattice.h"
#include "lat/lattice-functions.h"
#include "rnnlm/rnnlm-lattice-rescoring.h"
#include "util/common-utils.h"
#include "nnet3/nnet-utils.h"

int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    typedef kaldi::int32 int32;
    typedef kaldi::int64 int64;

    // Per review, keep the usage text high-level and point at the script in
    // steps/ that normally invokes this binary.
    const char *usage =
        "Rescores lattices with a Kaldi RNNLM.  This program is normally\n"
        "invoked via the script steps/lmrescore_rnnlm_lat.sh; see that\n"
        "script for example usage in context.\n"
        "\n"
        "Usage: lattice-lmrescore-kaldi-rnnlm [options] <embedding-file> <rnnlm-wordlist> \\\n"
        "             <word-symbol-table-rxfilename> <lattice-rspecifier> \\\n"
        "             <raw-rnnlm-rxfilename> <lattice-wspecifier>\n"
        " e.g.: lattice-lmrescore-kaldi-rnnlm --lm-scale=-1.0 word_embedding.mat \\\n"
        "             rnn_words.txt fst_words.txt ark:in.lats rnnlm ark:out.lats\n";

    ParseOptions po(usage);
    int32 max_ngram_order = 3;
    BaseFloat lm_scale = 1.0;

    po.Register("lm-scale", &lm_scale, "Scaling factor for language model "
                "costs; frequently 1.0 or -1.0");
    // Review fix: this option does not truncate the RNNLM context; it merges
    // RNNLM states whose histories agree in the most recent n words, which
    // bounds the number of distinct states created during composition.
    po.Register("max-ngram-order", &max_ngram_order, "If positive, merge "
                "RNNLM histories that share the same most recent n words "
                "(an n-gram approximation that bounds the state space); "
                "-1 means no merging is done.");

    po.Read(argc, argv);

    if (po.NumArgs() != 6) {
      po.PrintUsage();
      exit(1);
    }

    std::string lats_rspecifier, rnn_wordlist, word_embedding_rxfilename,
                word_symbols_rxfilename, rnnlm_rxfilename, lats_wspecifier;

    word_embedding_rxfilename = po.GetArg(1);
    rnn_wordlist = po.GetArg(2);
    word_symbols_rxfilename = po.GetArg(3);
    lats_rspecifier = po.GetArg(4);
    rnnlm_rxfilename = po.GetArg(5);
    lats_wspecifier = po.GetArg(6);

    // Reads the language model.
    kaldi::nnet3::Nnet rnnlm;
    ReadKaldiObject(rnnlm_rxfilename, &rnnlm);

    if (!IsSimpleNnet(rnnlm))
      KALDI_ERR << "Input RNNLM in " << rnnlm_rxfilename
                << " is not the type of neural net we were looking for; "
                   "failed IsSimpleNnet().";

    CuMatrix<BaseFloat> word_embedding_mat;
    ReadKaldiObject(word_embedding_rxfilename, &word_embedding_mat);

    // Precomputed info shared by all lattices; the per-lattice FST wrapper
    // below is cheap to construct from this.
    const nnet3::DecodableRnnlmSimpleLoopedComputationOptions opts;
    const nnet3::DecodableRnnlmSimpleLoopedInfo info(opts, rnnlm,
                                                     word_embedding_mat);

    // Reads and writes as compact lattice.
    SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier);
    CompactLatticeWriter compact_lattice_writer(lats_wspecifier);

    int32 n_done = 0, n_fail = 0;
    for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) {
      std::string key = compact_lattice_reader.Key();
      // Per review: the reader classes now allow holding a reference to the
      // current value, so we avoid copying the lattice and no longer need
      // FreeCurrent(); the value is released when we call Next().
      CompactLattice &clat = compact_lattice_reader.Value();

      if (lm_scale != 0.0) {
        // Before composing with the LM FST, we scale the lattice weights
        // by the inverse of "lm_scale".  We'll later scale by "lm_scale".
        // We do it this way so we can determinize and it will give the
        // right effect (taking the "best path" through the LM) regardless
        // of the sign of lm_scale.
        fst::ScaleLattice(fst::GraphLatticeScale(1.0 / lm_scale), &clat);
        ArcSort(&clat, fst::OLabelCompare<CompactLatticeArc>());

        // Wraps the rnnlm into FST.  We re-create it for each lattice to
        // prevent memory usage increasing with time.
        nnet3::KaldiRnnlmDeterministicFst rnnlm_fst(max_ngram_order,
                                                    rnn_wordlist,
                                                    word_symbols_rxfilename,
                                                    info);

        // Composes lattice with language model.
        CompactLattice composed_clat;
        ComposeCompactLatticeDeterministic(clat, &rnnlm_fst, &composed_clat);

        // Determinizes the composed lattice.
        Lattice composed_lat;
        ConvertLattice(composed_clat, &composed_lat);
        Invert(&composed_lat);
        CompactLattice determinized_clat;
        DeterminizeLattice(composed_lat, &determinized_clat);
        fst::ScaleLattice(fst::GraphLatticeScale(lm_scale),
                          &determinized_clat);
        if (determinized_clat.Start() == fst::kNoStateId) {
          KALDI_WARN << "Empty lattice for utterance " << key
                     << " (incompatible LM?)";
          n_fail++;
        } else {
          compact_lattice_writer.Write(key, determinized_clat);
          n_done++;
        }
      } else {
        // Zero scale so nothing to do.
        n_done++;
        compact_lattice_writer.Write(key, clat);
      }
    }

    KALDI_LOG << "Done " << n_done << " lattices, failed for " << n_fail;
    return (n_done != 0 ? 0 : 1);
  } catch(const std::exception &e) {
    std::cerr << e.what();
    return -1;
  }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you should introduce the 'tuning' directory concept and introduce soft links to the best current model. Otherwise this won't scale well.