nnet3-rnnlm lattice rescoring draft #1906
@@ -0,0 +1 @@
../../../scripts/rnnlm/
@@ -0,0 +1,115 @@
#!/bin/bash

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
#           2015  Guoguo Chen
#           2017  Hainan Xu

# This script trains LMs on the swbd LM-training data.

# Train objf: -299.20 -4.42 -4.24 -4.16 -4.10 -4.06 -4.03 -4.01 -3.98 -3.95 -3.94 -3.92 -3.90 -3.88 -3.86 -3.85 -3.84 -3.82 -3.81 -3.81 -3.79 -3.78 -3.77 -3.76 -3.74
# Dev objf:   -10.65 -4.67 -4.37 -4.25 -4.19 -4.14 -4.10 -4.07 -4.03 -4.00 -3.99 -3.98 -3.96 -3.95 -3.93 -3.93 -3.91 -3.91 -3.90 -3.90 -3.88 -3.88 -3.87 -3.87 -3.86

# Begin configuration section.

dir=exp/rnnlm_lstm_1a
embedding_dim=2048
lstm_rpd=512
lstm_nrpd=512
stage=-10
train_stage=-10

# variables for lattice rescoring
run_rescore=false
ac_model_dir=exp/chain/tdnn_lstm_1e_sp
decode_dir_suffix=rnnlm
ngram_order=4 # approximate lattice rescoring by limiting the max n-gram order;
              # if set, histories in the lattice that share the same n-gram
              # history are merged, which keeps the lattice from growing
              # exponentially.

. cmd.sh
. utils/parse_options.sh

text=data/train/text
lexicon=data/local/dict_nosp/lexiconp.txt
text_dir=data/rnnlm/text_nosp

Review comment: Our normal practice for Swbd is to include Fisher data as part of the LM training data. Is there a reason you are not doing this here?

Reply: data/train/text already includes both swbd and fisher data. It must have been done in previous data-preparation steps.

mkdir -p $dir/config
set -e

for f in $text $lexicon; do
  [ ! -f $f ] && \
    echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1
done

if [ $stage -le 0 ]; then
  mkdir -p $text_dir
  echo -n >$text_dir/dev.txt
  # hold out one in every 500 lines as dev data.
  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%500 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/swbd.txt
fi
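
if [ $stage -le 0 ]; then
  # Added sketch (not part of the original patch): sanity-check the hold-out --
  # since every 500th line goes to dev, dev.txt should contain roughly 1/500
  # of the total lines.
  wc -l $text_dir/dev.txt $text_dir/swbd.txt
fi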

if [ $stage -le 1 ]; then
  cp data/lang/words.txt $dir/config/
  n=`cat $dir/config/words.txt | wc -l`
  echo "<brk> $n" >> $dir/config/words.txt

  # words that are not present in words.txt but appear in the training or dev
  # data will be mapped to <unk> during training.
  echo "<unk>" >$dir/config/oov.txt

  cat > $dir/config/data_weights.txt <<EOF
swbd 1 1.0
EOF
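  # Added note (not part of the original patch): each line of data_weights.txt
  # gives "<corpus-name> <repetitions> <weight>", where <corpus-name> matches a
  # .txt file in $text_dir -- here the swbd data is used once, with weight 1.0.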

  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                             --unk-word="<unk>" \
                             --data-weights-file=$dir/config/data_weights.txt \
                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt

  # choose features
  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                           --use-constant-feature=true \
                           --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
                           $dir/config/words.txt > $dir/config/features.txt

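  # Added note (not part of the original patch): the xconfig below interleaves
  # TDNN layers (relu-renorm-layer, splicing the current step with an earlier
  # one via Append(0, IfDefined(-1)) or IfDefined(-3)) with projected LSTM
  # layers (fast-lstmp-layer). include-log-softmax=false because in kaldi-rnnlm
  # the network outputs an embedding-dimension vector, and word log-probs are
  # computed later against the output word-embedding matrix.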
  cat >$dir/config/xconfig <<EOF
input dim=$embedding_dim name=input
relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF
  rnnlm/validate_config_dir.sh $text_dir $dir/config
fi

if [ $stage -le 2 ]; then
  # the --unigram-factor option is set larger than the default (100)
  # in order to reduce the size of the sampling LM, because rnnlm-get-egs
  # was taking up too much CPU (as much as 10 cores).
  rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
fi

if [ $stage -le 3 ]; then
  rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 3 \
                       --stage $train_stage --num-epochs 10 --cmd "$train_cmd" $dir
fi

if [ $stage -le 4 ] && $run_rescore; then
  echo Perform lattice-rescoring on $ac_model_dir

Review comment: All echo statements should start with "$0: ". This means that things on the screen always have a clear source.

  LM=fsh_sw1_tg
  for decode_set in eval2000; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_$LM

    # Lattice rescoring
    rnnlm/lmrescore_rnnlm_lat.sh \
      --cmd "$decode_cmd --mem 4G" \
      --weight 0.5 --max-ngram-order $ngram_order \
      data/lang_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      ${decode_dir}_${decode_dir_suffix}
  done
fi

exit 0
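
A quick way to compare WER before and after rescoring (a sketch added for this writeup, not part of the patch): assuming the decode directories contain wer_* files as in many Kaldi recipes (Swbd's eval2000 sclite scoring writes score_*/*.sys files instead, so the grep would need adapting), Kaldi's utils/best_wer.sh summarizes the best result:

for d in exp/chain/tdnn_lstm_1e_sp/decode_eval2000_fsh_sw1_tg{,_rnnlm}; do
  grep WER $d/wer_* 2>/dev/null | utils/best_wer.sh
done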
@@ -0,0 +1,111 @@
#!/bin/bash

Review comment: There are no results here. You should create some kind of script to display and compare the results nicely, and put its output at the top of each of these scripts showing how it differs from any relevant baselines.

Reply: How important is it to have the results numbers? I notice that I might have deleted some of the recipes.

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
#           2015  Guoguo Chen
#           2017  Hainan Xu

# This script trains LMs on the swbd LM-training data.

# Train objf: -342.40 -4.48 -4.30 -4.21 -4.16 -4.12 -4.08 -4.07 -4.04 -4.00 -3.99 -3.97 -3.95 -3.94 -3.92 -3.91 -3.90 -3.89 -3.88 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83
# Dev objf:   -10.65 -4.72 -4.43 -4.31 -4.24 -4.18 -4.15 -4.12 -4.08 -4.06 -4.04 -4.02 -4.01 -3.99 -3.98 -3.97 -3.96 -3.95 -3.95 -3.94 -3.93 -3.92 -3.92 -3.91 -3.91

# Begin configuration section.
dir=exp/rnnlm_lstm_1b
embedding_dim=800
lstm_rpd=200
lstm_nrpd=200
stage=-10
train_stage=-10

# variables for lattice rescoring
run_rescore=false
ac_model_dir=exp/chain/tdnn_lstm_1e_sp
decode_dir_suffix=rnnlm
ngram_order=4 # approximate lattice rescoring by limiting the max n-gram order;
              # if set, histories in the lattice that share the same n-gram
              # history are merged, which keeps the lattice from growing
              # exponentially.

. cmd.sh
. utils/parse_options.sh

text=data/train/text
lexicon=data/local/dict_nosp/lexiconp.txt
text_dir=data/rnnlm/text_nosp
mkdir -p $dir/config
set -e

for f in $text $lexicon; do
  [ ! -f $f ] && \
    echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1
done

if [ $stage -le 0 ]; then
  mkdir -p $text_dir
  echo -n >$text_dir/dev.txt
  # hold out one in every 500 lines as dev data.
  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%500 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/swbd.txt
fi

if [ $stage -le 1 ]; then
  cp data/lang/words.txt $dir/config/
  n=`cat $dir/config/words.txt | wc -l`
  echo "<brk> $n" >> $dir/config/words.txt

  # words that are not present in words.txt but appear in the training or dev
  # data will be mapped to <unk> during training.
  echo "<unk>" >$dir/config/oov.txt

  cat > $dir/config/data_weights.txt <<EOF
swbd 1 1.0
EOF

  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                             --unk-word="<unk>" \
                             --data-weights-file=$dir/config/data_weights.txt \
                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt

  # choose features
  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                           --use-constant-feature=true \
                           --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
                           $dir/config/words.txt > $dir/config/features.txt

  cat >$dir/config/xconfig <<EOF
input dim=$embedding_dim name=input
relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF
  rnnlm/validate_config_dir.sh $text_dir $dir/config
fi

if [ $stage -le 2 ]; then
  rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
fi

if [ $stage -le 3 ]; then
  rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 3 \
                       --stage $train_stage --num-epochs 10 --cmd "$train_cmd" $dir
fi

if [ $stage -le 4 ] && $run_rescore; then
  echo Perform lattice-rescoring on $ac_model_dir
  LM=fsh_sw1_tg
  for decode_set in eval2000; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_$LM

    # Lattice rescoring
    rnnlm/lmrescore_rnnlm_lat.sh \
      --cmd "$decode_cmd --mem 4G" \
      --weight 0.5 --max-ngram-order $ngram_order \
      data/lang_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      ${decode_dir}_${decode_dir_suffix}
  done
fi

exit 0
@@ -0,0 +1,111 @@
#!/bin/bash

Review comment: For each new tuning script I'd like to have a comment saying what it was modified from and comparing the results with that old script, so we have some idea what the performance difference is. Talk to @keli78, maybe she can help write a script to automate this comparison of directories. What would be really nice is if you could have a script in local/rnnlm/ that prints out the train and dev perplexities as well as the decoding results, but I don't know how feasible that would be.

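A sketch of the kind of helper the comment above asks for (added for this writeup; the path local/rnnlm/run_lstm_1*.sh is an assumption about where these tuning scripts would live). It reads the "# Train objf:" / "# Dev objf:" headers each script records and converts the final dev objf, an average log-probability per word, into a perplexity:

for f in local/rnnlm/run_lstm_1*.sh; do
  echo "$f:"
  grep -E '^# (Train|Dev) objf:' $f
  grep '^# Dev objf:' $f | awk '{printf "  final dev perplexity: %.1f\n", exp(-$NF)}'
done
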
# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
#           2015  Guoguo Chen
#           2017  Hainan Xu

# This script trains LMs on the swbd LM-training data.

# Train objf: -341.90 -4.45 -4.27 -4.19 -4.13 -4.09 -4.05 -4.04 -4.01 -3.98 -3.96 -3.95 -3.93 -3.91 -3.90 -3.89 -3.88 -3.86 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80
# Dev objf:   -10.65 -4.68 -4.40 -4.28 -4.21 -4.16 -4.13 -4.10 -4.07 -4.04 -4.02 -4.00 -3.99 -3.97 -3.96 -3.95 -3.94 -3.94 -3.92 -3.92 -3.91 -3.90 -3.90 -3.89 -3.89

# Begin configuration section.
dir=exp/rnnlm_lstm_1c
embedding_dim=1024
lstm_rpd=256
lstm_nrpd=256
stage=-10
train_stage=-10

# variables for lattice rescoring
run_rescore=false
ac_model_dir=exp/chain/tdnn_lstm_1e_sp
decode_dir_suffix=rnnlm
ngram_order=4 # approximate lattice rescoring by limiting the max n-gram order;
              # if set, histories in the lattice that share the same n-gram
              # history are merged, which keeps the lattice from growing
              # exponentially.

. cmd.sh
. utils/parse_options.sh

text=data/train/text
lexicon=data/local/dict_nosp/lexiconp.txt
text_dir=data/rnnlm/text_nosp
mkdir -p $dir/config
set -e

for f in $text $lexicon; do
  [ ! -f $f ] && \
    echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1
done

if [ $stage -le 0 ]; then
  mkdir -p $text_dir
  echo -n >$text_dir/dev.txt
  # hold out one in every 500 lines as dev data.
  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%500 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/swbd.txt
fi

if [ $stage -le 1 ]; then
  cp data/lang/words.txt $dir/config/
  n=`cat $dir/config/words.txt | wc -l`
  echo "<brk> $n" >> $dir/config/words.txt

  # words that are not present in words.txt but appear in the training or dev
  # data will be mapped to <unk> during training.
  echo "<unk>" >$dir/config/oov.txt

  cat > $dir/config/data_weights.txt <<EOF
swbd 1 1.0
EOF

  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                             --unk-word="<unk>" \
                             --data-weights-file=$dir/config/data_weights.txt \
                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt

  # choose features
  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                           --use-constant-feature=true \
                           --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
                           $dir/config/words.txt > $dir/config/features.txt

  cat >$dir/config/xconfig <<EOF
input dim=$embedding_dim name=input
relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF
  rnnlm/validate_config_dir.sh $text_dir $dir/config
fi

if [ $stage -le 2 ]; then
  rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
fi

if [ $stage -le 3 ]; then
  rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 3 \
                       --stage $train_stage --num-epochs 10 --cmd "$train_cmd" $dir
fi

if [ $stage -le 4 ] && $run_rescore; then
  echo Perform lattice-rescoring on $ac_model_dir
  LM=fsh_sw1_tg
  for decode_set in eval2000; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_$LM

    # Lattice rescoring
    rnnlm/lmrescore_rnnlm_lat.sh \
      --cmd "$decode_cmd --mem 4G" \
      --weight 0.5 --max-ngram-order $ngram_order \
      data/lang_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      ${decode_dir}_${decode_dir_suffix}
  done
fi

exit 0
Review comment: Be careful with WER numbers on the train_dev subset if you include all the training data here. You might want to use data/train_nodev as the LM training data, and data/train_dev as the dev data.

Reply: There is no data/train_nodev or train_dev folder there. I am not sure what you mean here.