nnet3-rnnlm lattice rescoring draft #1906
@@ -0,0 +1 @@
../../../scripts/rnnlm/
@@ -0,0 +1,115 @@
#!/bin/bash

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
#           2015  Guoguo Chen
#           2017  Hainan Xu

# This script trains LMs on the swbd LM-training data.

# Train objf: -299.20 -4.42 -4.24 -4.16 -4.10 -4.06 -4.03 -4.01 -3.98 -3.95 -3.94 -3.92 -3.90 -3.88 -3.86 -3.85 -3.84 -3.82 -3.81 -3.81 -3.79 -3.78 -3.77 -3.76 -3.74
# Dev objf:   -10.65 -4.67 -4.37 -4.25 -4.19 -4.14 -4.10 -4.07 -4.03 -4.00 -3.99 -3.98 -3.96 -3.95 -3.93 -3.93 -3.91 -3.91 -3.90 -3.90 -3.88 -3.88 -3.87 -3.87 -3.86

# Begin configuration section.

dir=exp/rnnlm_lstm_1a
embedding_dim=2048
lstm_rpd=512
lstm_nrpd=512
stage=-10
train_stage=-10

# variables for lattice rescoring
run_rescore=false
ac_model_dir=exp/chain/tdnn_lstm_1e_sp
decode_dir_suffix=rnnlm
ngram_order=4 # approximate lattice rescoring by limiting the max n-gram order;
              # if set, histories in the lattice that share the same n-gram
              # history are merged, which keeps the lattice from growing
              # exponentially.

. cmd.sh
. utils/parse_options.sh

text=data/train/text
lexicon=data/local/dict_nosp/lexiconp.txt
text_dir=data/rnnlm/text_nosp

Review comment: Our normal practice for Swbd is to include Fisher data as part of the LM training data. Is there a reason you are not doing this here?

Reply: data/train/text already includes both swbd and fisher data. It must have been done in previous data-preparation steps.

mkdir -p $dir/config
set -e

for f in $text $lexicon; do
  [ ! -f $f ] && \
    echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1
done

if [ $stage -le 0 ]; then
  mkdir -p $text_dir
  echo -n >$text_dir/dev.txt
  # hold out one in every 500 lines as dev data.
  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%500 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/swbd.txt
fi
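
if [ $stage -le 0 ]; then
  # Added sketch (not part of the original patch): sanity-check the hold-out --
  # since every 500th line goes to dev, dev.txt should contain roughly 1/500
  # of the total lines.
  wc -l $text_dir/dev.txt $text_dir/swbd.txt
fi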

if [ $stage -le 1 ]; then
  cp data/lang/words.txt $dir/config/
  n=`cat $dir/config/words.txt | wc -l`
  echo "<brk> $n" >> $dir/config/words.txt

  # words that are not present in words.txt but appear in the training or dev
  # data will be mapped to <unk> during training.
  echo "<unk>" >$dir/config/oov.txt

  cat > $dir/config/data_weights.txt <<EOF
swbd 1 1.0
EOF
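  # Added note (not part of the original patch): each line of data_weights.txt
  # gives "<corpus-name> <repetitions> <weight>", where <corpus-name> matches a
  # .txt file in $text_dir -- here the swbd data is used once, with weight 1.0.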

  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                             --unk-word="<unk>" \
                             --data-weights-file=$dir/config/data_weights.txt \
                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt

  # choose features
  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                           --use-constant-feature=true \
                           --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
                           $dir/config/words.txt > $dir/config/features.txt

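  # Added note (not part of the original patch): the xconfig below interleaves
  # TDNN layers (relu-renorm-layer, splicing the current step with an earlier
  # one via Append(0, IfDefined(-1)) or IfDefined(-3)) with projected LSTM
  # layers (fast-lstmp-layer). include-log-softmax=false because in kaldi-rnnlm
  # the network outputs an embedding-dimension vector, and word log-probs are
  # computed later against the output word-embedding matrix.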
  cat >$dir/config/xconfig <<EOF
input dim=$embedding_dim name=input
relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF
  rnnlm/validate_config_dir.sh $text_dir $dir/config
fi

if [ $stage -le 2 ]; then
  # the --unigram-factor option is set larger than the default (100)
  # in order to reduce the size of the sampling LM, because rnnlm-get-egs
  # was taking up too much CPU (as much as 10 cores).
  rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
fi

if [ $stage -le 3 ]; then
  rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 3 \
                       --stage $train_stage --num-epochs 10 --cmd "$train_cmd" $dir
fi

if [ $stage -le 4 ] && $run_rescore; then
  echo Perform lattice-rescoring on $ac_model_dir

Review comment: All echo statements should start with "$0: ". This means that things on the screen always have a clear source.

  LM=fsh_sw1_tg
  for decode_set in eval2000; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_$LM

    # Lattice rescoring
    rnnlm/lmrescore_rnnlm_lat.sh \
      --cmd "$decode_cmd --mem 4G" \
      --weight 0.5 --max-ngram-order $ngram_order \
      data/lang_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      ${decode_dir}_${decode_dir_suffix}
  done
fi

exit 0
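
A quick way to compare WER before and after rescoring (a sketch added for this writeup, not part of the patch): assuming the decode directories contain wer_* files as in many Kaldi recipes (Swbd's eval2000 sclite scoring writes score_*/*.sys files instead, so the grep would need adapting), Kaldi's utils/best_wer.sh summarizes the best result:

for d in exp/chain/tdnn_lstm_1e_sp/decode_eval2000_fsh_sw1_tg{,_rnnlm}; do
  grep WER $d/wer_* 2>/dev/null | utils/best_wer.sh
done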
@@ -0,0 +1,111 @@
#!/bin/bash

Review comment: There are no results here. You should create some kind of script to display and compare the results nicely, and put its output at the top of each of these scripts showing how it differs from any relevant baselines.

Reply: How important is it to have the results numbers? I notice that I might have deleted some of the recipes.

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
#           2015  Guoguo Chen
#           2017  Hainan Xu

# This script trains LMs on the swbd LM-training data.

# Train objf: -342.40 -4.48 -4.30 -4.21 -4.16 -4.12 -4.08 -4.07 -4.04 -4.00 -3.99 -3.97 -3.95 -3.94 -3.92 -3.91 -3.90 -3.89 -3.88 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83
# Dev objf:   -10.65 -4.72 -4.43 -4.31 -4.24 -4.18 -4.15 -4.12 -4.08 -4.06 -4.04 -4.02 -4.01 -3.99 -3.98 -3.97 -3.96 -3.95 -3.95 -3.94 -3.93 -3.92 -3.92 -3.91 -3.91

# Begin configuration section.
dir=exp/rnnlm_lstm_1b
embedding_dim=800
lstm_rpd=200
lstm_nrpd=200
stage=-10
train_stage=-10

# variables for lattice rescoring
run_rescore=false
ac_model_dir=exp/chain/tdnn_lstm_1e_sp
decode_dir_suffix=rnnlm
ngram_order=4 # approximate lattice rescoring by limiting the max n-gram order;
              # if set, histories in the lattice that share the same n-gram
              # history are merged, which keeps the lattice from growing
              # exponentially.

. cmd.sh
. utils/parse_options.sh

text=data/train/text
lexicon=data/local/dict_nosp/lexiconp.txt
text_dir=data/rnnlm/text_nosp
mkdir -p $dir/config
set -e

for f in $text $lexicon; do
  [ ! -f $f ] && \
    echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1
done

if [ $stage -le 0 ]; then
  mkdir -p $text_dir
  echo -n >$text_dir/dev.txt
  # hold out one in every 500 lines as dev data.
  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%500 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/swbd.txt
fi

if [ $stage -le 1 ]; then
  cp data/lang/words.txt $dir/config/
  n=`cat $dir/config/words.txt | wc -l`
  echo "<brk> $n" >> $dir/config/words.txt

  # words that are not present in words.txt but appear in the training or dev
  # data will be mapped to <unk> during training.
  echo "<unk>" >$dir/config/oov.txt

  cat > $dir/config/data_weights.txt <<EOF
swbd 1 1.0
EOF

  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                             --unk-word="<unk>" \
                             --data-weights-file=$dir/config/data_weights.txt \
                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt

  # choose features
  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                           --use-constant-feature=true \
                           --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
                           $dir/config/words.txt > $dir/config/features.txt

  cat >$dir/config/xconfig <<EOF
input dim=$embedding_dim name=input
relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF
  rnnlm/validate_config_dir.sh $text_dir $dir/config
fi

if [ $stage -le 2 ]; then
  rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
fi

if [ $stage -le 3 ]; then
  rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 3 \
                       --stage $train_stage --num-epochs 10 --cmd "$train_cmd" $dir
fi

if [ $stage -le 4 ] && $run_rescore; then
  echo Perform lattice-rescoring on $ac_model_dir
  LM=fsh_sw1_tg
  for decode_set in eval2000; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_$LM

    # Lattice rescoring
    rnnlm/lmrescore_rnnlm_lat.sh \
      --cmd "$decode_cmd --mem 4G" \
      --weight 0.5 --max-ngram-order $ngram_order \
      data/lang_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      ${decode_dir}_${decode_dir_suffix}
  done
fi

exit 0
@@ -0,0 +1,111 @@
#!/bin/bash

Review comment: For each new tuning script I'd like to have a comment saying what it was modified from and comparing the results with that old script, so we have some idea what the performance difference is. Talk to @keli78, maybe she can help write a script to automate this comparison of directories. What would be really nice is if you could have a script in local/rnnlm/ that prints out the train and dev perplexities as well as the decoding results, but I don't know how feasible that would be.

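A sketch of the kind of helper the comment above asks for (added for this writeup; the path local/rnnlm/run_lstm_1*.sh is an assumption about where these tuning scripts would live). It reads the "# Train objf:" / "# Dev objf:" headers each script records and converts the final dev objf, an average log-probability per word, into a perplexity:

for f in local/rnnlm/run_lstm_1*.sh; do
  echo "$f:"
  grep -E '^# (Train|Dev) objf:' $f
  grep '^# Dev objf:' $f | awk '{printf "  final dev perplexity: %.1f\n", exp(-$NF)}'
done
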
# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
#           2015  Guoguo Chen
#           2017  Hainan Xu

# This script trains LMs on the swbd LM-training data.

# Train objf: -341.90 -4.45 -4.27 -4.19 -4.13 -4.09 -4.05 -4.04 -4.01 -3.98 -3.96 -3.95 -3.93 -3.91 -3.90 -3.89 -3.88 -3.86 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80
# Dev objf:   -10.65 -4.68 -4.40 -4.28 -4.21 -4.16 -4.13 -4.10 -4.07 -4.04 -4.02 -4.00 -3.99 -3.97 -3.96 -3.95 -3.94 -3.94 -3.92 -3.92 -3.91 -3.90 -3.90 -3.89 -3.89

# Begin configuration section.
dir=exp/rnnlm_lstm_1c
embedding_dim=1024
lstm_rpd=256
lstm_nrpd=256
stage=-10
train_stage=-10

# variables for lattice rescoring
run_rescore=false
ac_model_dir=exp/chain/tdnn_lstm_1e_sp
decode_dir_suffix=rnnlm
ngram_order=4 # approximate lattice rescoring by limiting the max n-gram order;
              # if set, histories in the lattice that share the same n-gram
              # history are merged, which keeps the lattice from growing
              # exponentially.

. cmd.sh
. utils/parse_options.sh

text=data/train/text
lexicon=data/local/dict_nosp/lexiconp.txt
text_dir=data/rnnlm/text_nosp
mkdir -p $dir/config
set -e

for f in $text $lexicon; do
  [ ! -f $f ] && \
    echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1
done

if [ $stage -le 0 ]; then
  mkdir -p $text_dir
  echo -n >$text_dir/dev.txt
  # hold out one in every 500 lines as dev data.
  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%500 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/swbd.txt
fi

if [ $stage -le 1 ]; then
  cp data/lang/words.txt $dir/config/
  n=`cat $dir/config/words.txt | wc -l`
  echo "<brk> $n" >> $dir/config/words.txt

  # words that are not present in words.txt but appear in the training or dev
  # data will be mapped to <unk> during training.
  echo "<unk>" >$dir/config/oov.txt

  cat > $dir/config/data_weights.txt <<EOF
swbd 1 1.0
EOF

  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                             --unk-word="<unk>" \
                             --data-weights-file=$dir/config/data_weights.txt \
                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt

  # choose features
  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                           --use-constant-feature=true \
                           --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
                           $dir/config/words.txt > $dir/config/features.txt

  cat >$dir/config/xconfig <<EOF
input dim=$embedding_dim name=input
relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF
  rnnlm/validate_config_dir.sh $text_dir $dir/config
fi

if [ $stage -le 2 ]; then
  rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
fi

if [ $stage -le 3 ]; then
  rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 3 \
                       --stage $train_stage --num-epochs 10 --cmd "$train_cmd" $dir
fi

if [ $stage -le 4 ] && $run_rescore; then
  echo Perform lattice-rescoring on $ac_model_dir
  LM=fsh_sw1_tg
  for decode_set in eval2000; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_$LM

    # Lattice rescoring
    rnnlm/lmrescore_rnnlm_lat.sh \
      --cmd "$decode_cmd --mem 4G" \
      --weight 0.5 --max-ngram-order $ngram_order \
      data/lang_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      ${decode_dir}_${decode_dir_suffix}
  done
fi

exit 0
Review comment: Be careful with WER numbers on the train_dev subset if you include all the training data here. You might want to use data/train_nodev as the LM training data, and data/train_dev as the dev data.

Reply: There is no data/train_nodev or train_dev folder there. I am not sure what you mean here.