-
Notifications
You must be signed in to change notification settings - Fork 5.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
nnet3-rnnlm lattice rescoring draft #1906
Changes from 7 commits
0d839b0
699c956
ef09b62
390a1bb
dc49709
8a33e77
5965b87
483450d
00912f7
b1167a2
a52da29
3bdaa4d
2b08335
7cf4af8
8f35242
705ecc8
d19ecc1
232ef04
bd9936b
9cc7ba1
267177f
87f2f6c
c9bf5e0
091d4d5
a192ada
697f219
acb5211
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#!/bin/bash

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)  Tony Robinson
#           2015  Guoguo Chen
#           2017  Hainan Xu

# This script trains an RNNLM on the swbd LM-training data.
# This script takes no command-line arguments but takes the --cmd option
# (and the other options listed below).

# Begin configuration section.
. ./cmd.sh           # defines train_cmd etc.; per review, use these rather
                     # than a hard-coded run.pl.
cmd=$train_cmd

dir=exp/rnnlm_lstm_d
embedding_dim=800
lstm_rpd=200         # LSTM recurrent-projection dim
lstm_nrpd=200        # LSTM non-recurrent-projection dim
stage=-10
train_stage=-10

. utils/parse_options.sh

text=data/train/text
lexicon=data/local/dict_nosp/lexiconp.txt
text_dir=data/rnnlm/text_nosp
mkdir -p $dir/config
set -e

for f in $text $lexicon; do
  [ ! -f $f ] && \
    echo "$0: expected file $f to exist; run the swbd data preparation first" && exit 1
done

if [ $stage -le 0 ]; then
  mkdir -p $text_dir
  echo -n >$text_dir/dev.txt
  # hold out one in every 500 lines as dev data.
  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%500 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/swbd.txt
fi

if [ $stage -le 1 ]; then
  # the training scripts require that <s>, </s> and <brk> be present in a
  # particular order.
  awk '{print $1}' $lexicon | sort | uniq | \
    awk 'BEGIN{print "<eps> 0";print "<s> 1"; print "</s> 2"; print "<brk> 3";n=4;} {print $1, n++}' \
    >$dir/config/words.txt
  # words that are not present in words.txt but are in the training or dev
  # data will be mapped to <unk> during training (see oov.txt below).
  echo "<unk>" >$dir/config/oov.txt

  cat > $dir/config/data_weights.txt <<EOF
swbd 1 1.0
EOF

  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                             --unk-word="<unk>" \
                             --data-weights-file=$dir/config/data_weights.txt \
                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt

  # choose features
  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                           --use-constant-feature=true \
                           --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
                           $dir/config/words.txt > $dir/config/features.txt

  cat >$dir/config/xconfig <<EOF
input dim=$embedding_dim name=input
relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF
  rnnlm/validate_config_dir.sh $text_dir $dir/config
fi

if [ $stage -le 2 ]; then
  # Per review, the default --unigram-factor (100) no longer causes excessive
  # CPU use in rnnlm-get-egs, so we do not override it here.
  rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
fi

if [ $stage -le 3 ]; then
  # Use the configurable $cmd (default: train_cmd from cmd.sh) rather than a
  # hard-coded queue.pl, so the --cmd option actually takes effect.
  rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 3 \
                       --stage $train_stage --num-epochs 10 --cmd "$cmd" $dir
fi

exit 0
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#!/bin/bash

# Rescores chain-model decoding lattices with the Kaldi RNNLM trained by the
# companion training script (see exp/rnnlm_lstm_d).  Calls
# steps/lmrescore_rnnlm_lat.sh for each test set.

# Begin configuration section.
n=50
ngram_order=4             # approximate the RNNLM by merging histories that
                          # share the same most recent n-gram context.
rnndir=exp/rnnlm_lstm_d   # default RNNLM dir; set it BEFORE parse_options so
                          # that a user-supplied --rnndir is not clobbered.
id=rnn

. ./cmd.sh                # defines decode_cmd, used below.
. ./path.sh
. ./utils/parse_options.sh

set -e

LM=fsh_sw1_tg

for decode_set in eval2000; do
  dir=exp/chain/tdnn_lstm_1e_sp
  decode_dir=${dir}/decode_${decode_set}_$LM

  # Lattice rescoring
  steps/lmrescore_rnnlm_lat.sh \
    --cmd "$decode_cmd --mem 16G" \
    --rnnlm-ver kaldirnnlm  --weight 0.5 --max-ngram-order $ngram_order \
    data/lang_$LM $rnndir \
    data/${decode_set}_hires ${decode_dir} \
    ${decode_dir}.nnet3rnnlm.lat.${ngram_order}gram

done
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,7 @@ data=$1 | |
|
||
if [ -f $data/stm ]; then # use sclite scoring. | ||
echo "$data/stm exists: using local/score_sclite.sh" | ||
eval local/score_sclite.sh $orig_args | ||
eval local/score_sclite.sh "$orig_args" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just noticed that all of these changes are in swbd/s5. This is super outdated. You should be using s5c. I doubt that this problem (if there was a problem) occurs in the latest script. In any case let me know what the problem was, because I'd be surprised if this was really a bug, this script being so old. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIRC if I don't add the "", if the $orig_args has something like --cmd "queue.pl --mem 8G" it'll complain. |
||
else | ||
echo "$data/stm does not exist: using local/score_basic.sh" | ||
eval local/score_basic.sh $orig_args | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../../scripts/rnnlm/ |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,6 +61,11 @@ if [ "$rnnlm_ver" == "tensorflow" ]; then | |
first_arg="$rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final" | ||
fi | ||
|
||
if [ "$rnnlm_ver" == "kaldirnnlm" ]; then | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it might be better at this point to have a separate script for this type of lattice rescoring, located in scripts/rnnlm/. That will keep things separate and will make it easier to refactor in future. |
||
rescoring_binary="lattice-lmrescore-kaldi-rnnlm" | ||
first_arg="\"rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|\" $rnnlm_dir/config/words.txt " | ||
fi | ||
|
||
oldlm=$oldlang/G.fst | ||
if [ -f $oldlang/G.carpa ]; then | ||
oldlm=$oldlang/G.carpa | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -65,16 +65,19 @@ num_splits=$(cat $dir/text/info/num_splits) | |
num_repeats=$(cat $dir/text/info/num_repeats) | ||
text_files=$(for n in $(seq $num_splits); do echo $dir/text/$n.txt; done) | ||
vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}') | ||
embedding_type= | ||
|
||
if [ -f $dir/feat_embedding.0.mat ]; then | ||
sparse_features=true | ||
embedding_type=feat_embedding | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's just make this either "feat" or "word", remove the "_embedding". |
||
if [ -f $dir/word_embedding.0.mat ]; then | ||
echo "$0: error: $dir/feat_embedding.0.mat and $dir/word_embedding.0.mat both exist." | ||
exit 1; | ||
fi | ||
! [ -f $dir/word_feats.txt ] && echo "$0: expected $0/word_feats.txt to exist" && exit 1; | ||
else | ||
sparse_features=false | ||
embedding_type=word_embedding | ||
! [ -f $dir/word_embedding.0.mat ] && \ | ||
echo "$0: expected $dir/word_embedding.0.mat to exist" && exit 1 | ||
fi | ||
|
@@ -193,7 +196,7 @@ while [ $x -lt $num_iters ]; do | |
[ -f $dir/.train_error ] && \ | ||
echo "$0: failure on iteration $x of training, see $dir/log/train.$x.*.log for details." && exit 1 | ||
if [ $this_num_jobs -gt 1 ]; then | ||
# average the models and the embedding matrces. Use run.pl as we don't | ||
# average the models and the embedding matrces. Use run.pl as we don\'t | ||
# want this to wait on the queue (if there is a queue). | ||
src_models=$(for n in $(seq $this_num_jobs); do echo $dir/$[x+1].$n.raw; done) | ||
src_matrices=$(for n in $(seq $this_num_jobs); do echo $dir/${embedding_type}.$[x+1].$n.mat; done) | ||
|
@@ -219,8 +222,11 @@ if [ $stage -le $num_iters ]; then | |
echo "$0: best iteration (out of $num_iters) was $best_iter, linking it to final iteration." | ||
ln -sf $embedding_type.$best_iter.mat $dir/$embedding_type.final.mat | ||
ln -sf $best_iter.raw $dir/final.raw | ||
ln -sf $best_iter.raw $dir/rnnlm # to make it consistent with other RNNLMs | ||
fi | ||
|
||
touch $dir/unk.probs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. once we modify this setup to have its own rescoring scripts, unk.probs may no longer be needed. but I may merge this as-is for now. |
||
|
||
# Now get some diagnostics about the evolution of the objective function. | ||
if [ $stage -le $[num_iters+1] ]; then | ||
( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,9 @@ all: | |
EXTRA_CXXFLAGS = -Wno-sign-compare | ||
include ../kaldi.mk | ||
|
||
LDFLAGS += $(CUDA_LDFLAGS) | ||
LDLIBS += $(CUDA_LDLIBS) | ||
|
||
BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ | ||
lattice-lmrescore lattice-scale lattice-union lattice-to-post \ | ||
lattice-determinize lattice-oracle lattice-rmali \ | ||
|
@@ -21,18 +24,20 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ | |
lattice-confidence lattice-determinize-phone-pruned \ | ||
lattice-determinize-phone-pruned-parallel lattice-expand-ngram \ | ||
lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ | ||
lattice-arc-post lattice-determinize-non-compact \ | ||
lattice-arc-post lattice-determinize-non-compact lattice-lmrescore-kaldi-rnnlm \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the "latbin" directory now depends on nnet3 and cudamatrix, you will have to update the directory-level dependencies in ../Makefile. You could also see what misc/maintenance/reorder_addlibs.sh and misc/maintenance/find_missing_dependencies.sh do-- it may not be necessary to do this manually. |
||
lattice-lmrescore-pruned | ||
|
||
OBJFILES = | ||
|
||
cuda-compiled.o: ../kaldi.mk | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know where this line came from but it shouldn't be here. |
||
|
||
|
||
TESTFILES = | ||
|
||
ADDLIBS = ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../fstext/kaldi-fstext.a \ | ||
../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ | ||
../matrix/kaldi-matrix.a \ | ||
ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../lat/kaldi-lat.a ../nnet3/kaldi-nnet3.a ../lm/kaldi-lm.a \ | ||
../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ | ||
../util/kaldi-util.a \ | ||
../cudamatrix/kaldi-cudamatrix.a ../matrix/kaldi-matrix.a \ | ||
../base/kaldi-base.a | ||
|
||
include ../makefiles/default_rules.mk |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
// latbin/lattice-lmrescore-kaldi-rnnlm.cc

// Copyright 2017  Johns Hopkins University (author: Daniel Povey)
//           2017  Hainan Xu
//           2017  Yiming Wang

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#include "base/kaldi-common.h"
#include "fstext/fstext-lib.h"
#include "lat/kaldi-lattice.h"
#include "lat/lattice-functions.h"
#include "rnnlm/rnnlm-lattice-rescoring.h"
#include "util/common-utils.h"
#include "nnet3/nnet-utils.h"

int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    typedef kaldi::int32 int32;
    typedef kaldi::int64 int64;

    // Per review, keep the usage text high-level and point at the script in
    // steps/ that normally invokes this binary.
    const char *usage =
        "Rescores lattices with a Kaldi RNNLM.  This program is normally\n"
        "invoked via the script steps/lmrescore_rnnlm_lat.sh; see that\n"
        "script for example usage in context.\n"
        "\n"
        "Usage: lattice-lmrescore-kaldi-rnnlm [options] <embedding-file> <rnnlm-wordlist> \\\n"
        "             <word-symbol-table-rxfilename> <lattice-rspecifier> \\\n"
        "             <raw-rnnlm-rxfilename> <lattice-wspecifier>\n"
        " e.g.: lattice-lmrescore-kaldi-rnnlm --lm-scale=-1.0 word_embedding.mat \\\n"
        "             rnn_words.txt fst_words.txt ark:in.lats rnnlm ark:out.lats\n";

    ParseOptions po(usage);
    int32 max_ngram_order = 3;
    BaseFloat lm_scale = 1.0;

    po.Register("lm-scale", &lm_scale, "Scaling factor for language model "
                "costs; frequently 1.0 or -1.0");
    // Review fix: this option does not truncate the RNNLM context; it merges
    // RNNLM states whose histories agree in the most recent n words, which
    // bounds the number of distinct states created during composition.
    po.Register("max-ngram-order", &max_ngram_order, "If positive, merge "
                "RNNLM histories that share the same most recent n words "
                "(an n-gram approximation that bounds the state space); "
                "-1 means no merging is done.");

    po.Read(argc, argv);

    if (po.NumArgs() != 6) {
      po.PrintUsage();
      exit(1);
    }

    std::string lats_rspecifier, rnn_wordlist, word_embedding_rxfilename,
                word_symbols_rxfilename, rnnlm_rxfilename, lats_wspecifier;

    word_embedding_rxfilename = po.GetArg(1);
    rnn_wordlist = po.GetArg(2);
    word_symbols_rxfilename = po.GetArg(3);
    lats_rspecifier = po.GetArg(4);
    rnnlm_rxfilename = po.GetArg(5);
    lats_wspecifier = po.GetArg(6);

    // Reads the language model.
    kaldi::nnet3::Nnet rnnlm;
    ReadKaldiObject(rnnlm_rxfilename, &rnnlm);

    if (!IsSimpleNnet(rnnlm))
      KALDI_ERR << "Input RNNLM in " << rnnlm_rxfilename
                << " is not the type of neural net we were looking for; "
                   "failed IsSimpleNnet().";

    CuMatrix<BaseFloat> word_embedding_mat;
    ReadKaldiObject(word_embedding_rxfilename, &word_embedding_mat);

    // Precomputed info shared by all lattices; the per-lattice FST wrapper
    // below is cheap to construct from this.
    const nnet3::DecodableRnnlmSimpleLoopedComputationOptions opts;
    const nnet3::DecodableRnnlmSimpleLoopedInfo info(opts, rnnlm,
                                                     word_embedding_mat);

    // Reads and writes as compact lattice.
    SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier);
    CompactLatticeWriter compact_lattice_writer(lats_wspecifier);

    int32 n_done = 0, n_fail = 0;
    for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) {
      std::string key = compact_lattice_reader.Key();
      // Per review: the reader classes now allow holding a reference to the
      // current value, so we avoid copying the lattice and no longer need
      // FreeCurrent(); the value is released when we call Next().
      CompactLattice &clat = compact_lattice_reader.Value();

      if (lm_scale != 0.0) {
        // Before composing with the LM FST, we scale the lattice weights
        // by the inverse of "lm_scale".  We'll later scale by "lm_scale".
        // We do it this way so we can determinize and it will give the
        // right effect (taking the "best path" through the LM) regardless
        // of the sign of lm_scale.
        fst::ScaleLattice(fst::GraphLatticeScale(1.0 / lm_scale), &clat);
        ArcSort(&clat, fst::OLabelCompare<CompactLatticeArc>());

        // Wraps the rnnlm into FST.  We re-create it for each lattice to
        // prevent memory usage increasing with time.
        nnet3::KaldiRnnlmDeterministicFst rnnlm_fst(max_ngram_order,
                                                    rnn_wordlist,
                                                    word_symbols_rxfilename,
                                                    info);

        // Composes lattice with language model.
        CompactLattice composed_clat;
        ComposeCompactLatticeDeterministic(clat, &rnnlm_fst, &composed_clat);

        // Determinizes the composed lattice.
        Lattice composed_lat;
        ConvertLattice(composed_clat, &composed_lat);
        Invert(&composed_lat);
        CompactLattice determinized_clat;
        DeterminizeLattice(composed_lat, &determinized_clat);
        fst::ScaleLattice(fst::GraphLatticeScale(lm_scale),
                          &determinized_clat);
        if (determinized_clat.Start() == fst::kNoStateId) {
          KALDI_WARN << "Empty lattice for utterance " << key
                     << " (incompatible LM?)";
          n_fail++;
        } else {
          compact_lattice_writer.Write(key, determinized_clat);
          n_done++;
        }
      } else {
        // Zero scale so nothing to do.
        n_done++;
        compact_lattice_writer.Write(key, clat);
      }
    }

    KALDI_LOG << "Done " << n_done << " lattices, failed for " << n_fail;
    return (n_done != 0 ? 0 : 1);
  } catch(const std::exception &e) {
    std::cerr << e.what();
    return -1;
  }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you should introduce the 'tuning' directory concept and introduce soft links to the best current model. Otherwise this won't scale well.