[egs] Update WSJ flat-start chain recipes to use TDNN-F not TDNN+LSTM

hhadian authored and danpovey committed Jan 12, 2019
1 parent 9b6fbdd commit c017268
Showing 4 changed files with 295 additions and 59 deletions.
91 changes: 50 additions & 41 deletions egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh
@@ -3,33 +3,31 @@

# This script performs chain training in a flat-start manner
# and without building or using any context-dependency tree.
# It does not use ivectors or other forms of speaker adaptation
# except simple mean and variance normalization.
# It does not use ivectors or other forms of speaker adaptation.
# It is called from run_e2e_phone.sh

# Note: this script is configured as phone-based, if you want
# to run it in character mode, you'll need to change _nosp
# to _char everywhere and also copy char_lm.fst instead
# of phone_lm.fst (in stage 1 below)

# local/chain/compare_wer.sh exp/chain/e2e_tdnn_1a
# System e2e_tdnn_1a
#WER dev93 (tgpr) 9.63
#WER dev93 (tg) 9.07
#WER dev93 (big-dict,tgpr) 7.41
#WER dev93 (big-dict,fg) 6.55
#WER eval92 (tgpr) 5.90
#WER eval92 (tg) 5.17
#WER eval92 (big-dict,tgpr) 3.56
#WER eval92 (big-dict,fg) 2.85
# Final train prob -0.0726
# Final valid prob -0.0884
# to _char everywhere.

# local/chain/compare_wer.sh exp/chain/e2e_tdnnf_1a
# System e2e_tdnnf_1a
#WER dev93 (tgpr) 8.77
#WER dev93 (tg) 8.11
#WER dev93 (big-dict,tgpr) 6.17
#WER dev93 (big-dict,fg) 5.66
#WER eval92 (tgpr) 5.62
#WER eval92 (tg) 5.19
#WER eval92 (big-dict,tgpr) 3.23
#WER eval92 (big-dict,fg) 2.80
# Final train prob -0.0618
# Final valid prob -0.0825
# Final train prob (xent)
# Final valid prob (xent)
# Num-params 3740934
# Num-params 6772564

# steps/info/chain_dir_info.pl exp/chain/e2e_tdnn_1a
# exp/chain/e2e_tdnn_1a: num-iters=102 nj=2..5 num-params=3.7M dim=40->84 combine=-0.117->-0.116 (over 3) logprob:train/valid[67,101,final]=(-0.080,-0.073,-0.073/-0.090,-0.089,-0.088)
# steps/info/chain_dir_info.pl exp/chain/e2e_tdnnf_1a
# exp/chain/e2e_tdnnf_1a: num-iters=180 nj=2..8 num-params=6.8M dim=40->84 combine=-0.060->-0.060 (over 3) logprob:train/valid[119,179,final]=(-0.080,-0.062,-0.062/-0.089,-0.083,-0.083)

set -e

@@ -40,15 +38,15 @@ get_egs_stage=-10
affix=1a

# training options
num_epochs=4
dropout_schedule='0,0@0.20,0.5@0.50,0'
num_epochs=10
num_jobs_initial=2
num_jobs_final=5
minibatch_size=150=128,64/300=100,64,32/600=50,32,16/1200=16,8
num_jobs_final=8
minibatch_size=150=128,64/300=64,32/600=32,16/1200=8
common_egs_dir=
l2_regularize=0.00005
dim=450
frames_per_iter=3000000
cmvn_opts="--norm-means=true --norm-vars=true"
cmvn_opts="--norm-means=false --norm-vars=false"
train_set=train_si284_spe2e_hires
test_sets="test_dev93 test_eval92"

@@ -69,7 +67,7 @@ fi

lang=data/lang_e2e
treedir=exp/chain/e2e_tree # it's actually just a trivial tree (no tree building)
dir=exp/chain/e2e_tdnn_${affix}
dir=exp/chain/e2e_tdnnf_${affix}

if [ $stage -le 0 ]; then
# Create a version of the lang/ directory that has one state per phone in the
@@ -102,25 +100,35 @@ fi
if [ $stage -le 2 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
opts="l2-regularize=0.01"
output_opts="l2-regularize=0.0025"
tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true"
tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
prefinal_opts="l2-regularize=0.01"
output_opts="l2-regularize=0.005"

mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
input dim=40 name=input
relu-batchnorm-layer name=tdnn1 input=Append(-1,0,1) dim=$dim
relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$dim $opts
relu-batchnorm-layer name=tdnn3 dim=$dim $opts
relu-batchnorm-layer name=tdnn4 input=Append(-1,0,1) dim=$dim $opts
relu-batchnorm-layer name=tdnn5 dim=$dim $opts
relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$dim $opts
relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$dim $opts
relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$dim $opts
relu-batchnorm-layer name=prefinal-chain dim=$dim target-rms=0.5 $opts
output-layer name=output include-log-softmax=true dim=$num_targets $output_opts
relu-batchnorm-dropout-layer name=tdnn1 input=Append(-1,0,1) $tdnn_opts dim=1024
tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0
tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
linear-component name=prefinal-l dim=192 $linear_opts
prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192
output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
EOF
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs
@@ -139,14 +147,15 @@ if [ $stage -le 3 ]; then
--egs.dir "$common_egs_dir" \
--egs.stage $get_egs_stage \
--egs.opts "" \
--trainer.dropout-schedule $dropout_schedule \
--trainer.num-chunk-per-minibatch $minibatch_size \
--trainer.frames-per-iter $frames_per_iter \
--trainer.num-epochs $num_epochs \
--trainer.optimization.momentum 0 \
--trainer.optimization.num-jobs-initial $num_jobs_initial \
--trainer.optimization.num-jobs-final $num_jobs_final \
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.initial-effective-lrate 0.0005 \
--trainer.optimization.final-effective-lrate 0.00005 \
--trainer.optimization.shrink-value 1.0 \
--trainer.max-param-change 2.0 \
--cleanup.remove-egs true \
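
A hedged sketch, not part of this commit: the header note in run_tdnn_flatstart.sh says the phone-based recipe can be switched to character mode by changing _nosp to _char everywhere. One illustrative, non-destructive way to do that (the derived file name is hypothetical, and the _char lang directory is assumed to have been prepared by the run_e2e_char.sh flow):

    # run from egs/wsj/s5; write a character-mode copy rather than editing in place
    sed 's/_nosp/_char/g' local/chain/e2e/run_tdnn_flatstart.sh \
      > local/chain/e2e/run_tdnn_char_flatstart.sh   # hypothetical file name
    chmod +x local/chain/e2e/run_tdnn_char_flatstart.sh
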
35 changes: 18 additions & 17 deletions egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh
@@ -6,31 +6,32 @@
# a full trivial biphone context-dependency tree. This is because this recipe is
# meant for character-based (i.e. lexicon-free) modeling where context helps
# significantly.
# It does not use ivectors or other forms of speaker adaptation
# except simple mean and variance normalization.
# It does not use ivectors or other forms of speaker adaptation.
# It is called from run_e2e_char.sh

# Note: this script is configured to run as character-based, if you want
# to run it in phoneme mode, you'll need to change _char
# to _nosp everywhere and also copy phone_lm.fst instead
# of char_lm.fst (in stage 1 below)
# to _nosp everywhere.


# local/chain/compare_wer.sh exp/chain/e2e_tdnn_lstm_bichar_1a
# System e2e_tdnn_lstm_bichar_1a
# WER dev93 (tgpr) 9.42
# WER dev93 (tg) 8.85
# WER dev93 (big-dict,tgpr) 7.70
# WER dev93 (big-dict,fg) 6.79
# WER eval92 (tgpr) 6.42
# WER eval92 (tg) 6.11
# WER eval92 (big-dict,tgpr) 4.50
# WER eval92 (big-dict,fg) 4.09
# Final train prob -0.7535
# Final valid prob -0.7786
#WER dev93 (tgpr) 9.85
#WER dev93 (tg) 9.32
#WER dev93 (big-dict,tgpr) 8.19
#WER dev93 (big-dict,fg) 7.27
#WER eval92 (tgpr) 6.89
#WER eval92 (tg) 6.70
#WER eval92 (big-dict,tgpr) 5.14
#WER eval92 (big-dict,fg) 4.29
# Final train prob -0.0610
# Final valid prob -0.0836
# Final train prob (xent)
# Final valid prob (xent)
# Num-params 9219188

# steps/info/chain_dir_info.pl exp/chain/e2e_tdnn_lstm_bichar_1a/
# exp/chain/e2e_tdnn_lstm_bichar_1a/: num-iters=138 nj=2..5 num-params=9.2M dim=40->3444 combine=-6.480->-6.478 logprob:train/valid[91,137,final]=(-0.766,-0.754,-0.754/-0.784,-0.779,-0.779)

# exp/chain/e2e_tdnn_lstm_bichar_1a_nocmvn: num-iters=138 nj=2..5 num-params=9.2M dim=40->3444 combine=-1.211->-1.211 (over 3) logprob:train/valid[91,137,final]=(-0.079,-0.062,-0.061/-0.093,-0.084,-0.084)

set -e

@@ -50,7 +51,7 @@ common_egs_dir=
l2_regularize=0.00001
dim=512
frames_per_iter=2500000
cmvn_opts="--norm-means=true --norm-vars=true"
cmvn_opts="--norm-means=false --norm-vars=false"
train_set=train_si284_spe2e_hires
test_sets="test_dev93 test_eval92"

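A hedged usage sketch, not part of this commit: once both flat-start systems have been trained and decoded, the comparison script quoted in the result tables above can be pointed at both experiment directories at once (multi-directory support is assumed here, as is usual for the WSJ local/chain/compare_wer.sh):

    # run from egs/wsj/s5 after both systems have finished training and decoding
    local/chain/compare_wer.sh exp/chain/e2e_tdnnf_1a exp/chain/e2e_tdnn_lstm_bichar_1a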