-
Notifications
You must be signed in to change notification settings - Fork 5.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
nnet3-rnnlm lattice rescoring draft #1906
Changes from 1 commit
0d839b0
699c956
ef09b62
390a1bb
dc49709
8a33e77
5965b87
483450d
00912f7
b1167a2
a52da29
3bdaa4d
2b08335
7cf4af8
8f35242
705ecc8
d19ecc1
232ef04
bd9936b
9cc7ba1
267177f
87f2f6c
c9bf5e0
091d4d5
a192ada
697f219
acb5211
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
#!/bin/bash | ||
|
||
# Copyright 2015 Guoguo Chen | ||
# 2017 Hainan Xu | ||
# Apache 2.0 | ||
|
||
# This script rescores lattices with RNNLM. See also rnnlmrescore.sh which is | ||
# an older script using n-best lists. | ||
|
||
# Begin configuration section. | ||
cmd=run.pl | ||
skip_scoring=false | ||
max_ngram_order=4 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Clarify via a comment that this is for the n-gram approximation in lattice rescoring, and if feasible, refer to some kind of paper that explains this concept. |
||
N=10 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I assume this N variable is no longer needed. |
||
inv_acwt=12 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We normally make this default to 10. Making the default 12 would be confusing. |
||
weight=1.0 # Interpolation weight for RNNLM. | ||
# End configuration section. | ||
rnnlm_ver= | ||
#layer_string= | ||
|
||
echo "$0 $@" # Print the command line for logging | ||
|
||
. ./utils/parse_options.sh | ||
|
||
if [ $# != 5 ]; then | ||
echo "Does language model rescoring of lattices (remove old LM, add new LM)" | ||
echo "with RNNLM." | ||
echo "" | ||
echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\" | ||
echo " <data-dir> <input-decode-dir> <output-decode-dir>" | ||
echo " e.g.: $0 ./rnnlm data/lang_tg data/test \\" | ||
echo " exp/tri3/test_tg exp/tri3/test_rnnlm" | ||
echo "options: [--cmd (run.pl|queue.pl [queue opts])]" | ||
exit 1; | ||
fi | ||
|
||
[ -f path.sh ] && . ./path.sh; | ||
|
||
oldlang=$1 | ||
rnnlm_dir=$2 | ||
data=$3 | ||
indir=$4 | ||
outdir=$5 | ||
|
||
rescoring_binary=lattice-lmrescore-rnnlm | ||
|
||
|
||
if [ "$rnnlm_ver" == "kaldirnnlm" ]; then | ||
rescoring_binary="lattice-lmrescore-kaldi-rnnlm" | ||
first_arg="\"rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|\" $rnnlm_dir/config/words.txt " | ||
fi | ||
|
||
oldlm=$oldlang/G.fst | ||
if [ -f $oldlang/G.carpa ]; then | ||
oldlm=$oldlang/G.carpa | ||
elif [ ! -f $oldlm ]; then | ||
echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\ | ||
exit 1; | ||
fi | ||
|
||
[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; | ||
[ ! -f $rnnlm_dir/final.raw ] && echo "$0: Missing file $rnnlm_dir/final.raw" && exit 1; | ||
[ ! -f $rnnlm_dir/feat_embedding.final.mat ] && [ ! -f $rnnlm_dir/word_embedding.final.mat ] && echo "$0: Missing word embedding file" && exit 1; | ||
|
||
[ ! -f $oldlang/words.txt ] &&\ | ||
echo "$0: Missing file $oldlang/words.txt" && exit 1; | ||
! ls $indir/lat.*.gz >/dev/null &&\ | ||
echo "$0: No lattices input directory $indir" && exit 1; | ||
awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { | ||
print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ | ||
|| exit 1; | ||
|
||
oldlm_command="fstproject --project_output=true $oldlm |" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe this projection is no longer necessary because IIRC the code does the projection internally. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, this is not the pruned version of lattice-rescoring and it uses the old latbin binaries. |
||
|
||
acwt=`perl -e "print (1.0/$inv_acwt);"` | ||
|
||
word_embedding= | ||
if [ -f $rnnlm_dir/word_embedding.final.mat ]; then | ||
word_embedding=$rnnlm_dir/word_embedding.final.mat | ||
else | ||
word_embedding="\"rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|\"" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To avoid the need for escaping, I suggest using single quotes for the inner quotes. |
||
fi | ||
|
||
mkdir -p $outdir/log | ||
nj=`cat $indir/num_jobs` || exit 1; | ||
cp $indir/num_jobs $outdir | ||
|
||
oldlm_weight=`perl -e "print -1.0 * $weight;"` | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I generally prefer |
||
if [ "$oldlm" == "$oldlang/G.fst" ]; then | ||
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ | ||
lattice-lmrescore --lm-scale=$oldlm_weight \ | ||
"ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \ | ||
$rescoring_binary --lm-scale=$weight \ | ||
--max-ngram-order=$max_ngram_order \ | ||
$oldlang/words.txt ark:- $rnnlm_dir/config/words.txt $word_embedding "$rnnlm_dir/final.raw" \ | ||
"ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; | ||
else | ||
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ | ||
lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \ | ||
"ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm" ark:- \| \ | ||
$rescoring_binary --lm-scale=$weight \ | ||
--max-ngram-order=$max_ngram_order \ | ||
$oldlang/words.txt ark:- $rnnlm_dir/config/words.txt $word_embedding "$rnnlm_dir/final.raw" \ | ||
"ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; | ||
fi | ||
if ! $skip_scoring ; then | ||
err_msg="Not scoring because local/score.sh does not exist or not executable." | ||
[ ! -x local/score.sh ] && echo $err_msg && exit 1; | ||
local/score.sh --cmd "$cmd" $data $oldlang $outdir | ||
else | ||
echo "Not scoring because requested so..." | ||
fi | ||
|
||
exit 0; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
// rnnlm/rnnlm-lattice-rescoring.cc | ||
|
||
// Copyright 2017 Johns Hopkins University (author: Daniel Povey) | ||
// Yiming Wang | ||
// Hainan Xu | ||
// Copyright 2017 Johns Hopkins University (author: Daniel Povey) | ||
// 2017 Yiming Wang | ||
// 2017 Hainan Xu | ||
// | ||
// See ../../COPYING for clarification regarding multiple authors | ||
// | ||
|
@@ -64,7 +64,9 @@ void KaldiRnnlmDeterministicFst::ReadFstWordSymbolTableAndRnnWordlist( | |
int32 i = 0; | ||
while (ifile >> word >> id) { | ||
if (word == "</s>") { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. IIRC, other parts of the code don't assume that these are the written forms of BOS and EOS; they take these as options. The scripts do, currently, so this might not be fatal -- but generally speaking, programs don't take any arguments with written forms of words unless they absolutely have to for some reason, and this program breaks that pattern. I'd be happier to add the same options as for the training tools, with --bos-symbol and --eos-symbol (and an ignored option, --brk-symbol, if it makes your life easier scripting-wise). |
||
final_word_index_ = id; | ||
eos_index_ = id; | ||
} else if (word == "<s>") { | ||
bos_index_ = id; | ||
} | ||
KALDI_ASSERT(i == id); | ||
i++; | ||
|
@@ -95,18 +97,18 @@ void KaldiRnnlmDeterministicFst::ReadFstWordSymbolTableAndRnnWordlist( | |
KaldiRnnlmDeterministicFst::KaldiRnnlmDeterministicFst(int32 max_ngram_order, | ||
const std::string &rnn_wordlist, | ||
const std::string &word_symbol_table_rxfilename, | ||
const DecodableRnnlmSimpleLoopedInfo &info) { | ||
const RnnlmSimpleLoopedInfo &info) { | ||
max_ngram_order_ = max_ngram_order; | ||
ReadFstWordSymbolTableAndRnnWordlist(rnn_wordlist, | ||
word_symbol_table_rxfilename); | ||
|
||
std::vector<Label> bos; | ||
bos.push_back(0); // 0 for <s> | ||
state_to_wseq_.push_back(bos); | ||
DecodableRnnlmSimpleLooped decodable_rnnlm(info); | ||
decodable_rnnlm.TakeFeatures(std::vector<Label>(1, bos[0])); | ||
std::vector<Label> bos_seq; | ||
bos_seq.push_back(bos_index_); | ||
state_to_wseq_.push_back(bos_seq); | ||
RnnlmSimpleLooped decodable_rnnlm(info); | ||
decodable_rnnlm.TakeFeatures(bos_seq); | ||
state_to_decodable_rnnlm_.push_back(decodable_rnnlm); | ||
wseq_to_state_[bos] = 0; | ||
wseq_to_state_[bos_seq] = 0; | ||
start_state_ = 0; | ||
} | ||
|
||
|
@@ -115,7 +117,7 @@ fst::StdArc::Weight KaldiRnnlmDeterministicFst::Final(StateId s) { | |
KALDI_ASSERT(static_cast<size_t>(s) < state_to_wseq_.size()); | ||
|
||
// log prob of end of sentence | ||
BaseFloat logprob = state_to_decodable_rnnlm_[s].GetOutput(0, final_word_index_); | ||
BaseFloat logprob = state_to_decodable_rnnlm_[s].GetOutput(0, eos_index_); | ||
return Weight(-logprob); | ||
} | ||
|
||
|
@@ -125,7 +127,7 @@ bool KaldiRnnlmDeterministicFst::GetArc(StateId s, Label ilabel, | |
KALDI_ASSERT(static_cast<size_t>(s) < state_to_wseq_.size()); | ||
|
||
std::vector<Label> wseq = state_to_wseq_[s]; | ||
DecodableRnnlmSimpleLooped decodable_rnnlm = state_to_decodable_rnnlm_[s]; | ||
RnnlmSimpleLooped decodable_rnnlm = state_to_decodable_rnnlm_[s]; | ||
int32 rnn_word = fst_label_to_rnn_label_[ilabel]; | ||
|
||
BaseFloat logprob = decodable_rnnlm.GetOutput(0, rnn_word); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't like the 'rnnlm' in the name here because it's redundant-- it's already in a directory called 'rnnlm'. You could call it 'lmrescore_lat.sh', but IMO the name 'lmrescore.sh' would be more ideal, and you could then have the n-best one be called 'lmrescore_nbest.sh', if needed.