Skip to content

Commit

Permalink
[egs] Remove pitch from multi_cn nnet3 recipe (#3686)
Browse files Browse the repository at this point in the history
  • Loading branch information
naxingyu authored and danpovey committed Oct 28, 2019
1 parent e2c006c commit e3a1b99
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 71 deletions.
16 changes: 8 additions & 8 deletions egs/multi_cn/s5/RESULTS
Original file line number Diff line number Diff line change
Expand Up @@ -6,35 +6,35 @@
%WER 19.03 [ 19941 / 104765, 725 ins, 1222 del, 17994 sub ] exp/tri3a/decode_aishell_test_tg/cer_13_0.5
%WER 21.68 [ 22710 / 104765, 902 ins, 2361 del, 19447 sub ] exp/tri4a/decode_aishell_test_tg/cer_14_0.0
%WER 16.64 [ 17436 / 104765, 857 ins, 706 del, 15873 sub ] exp/tri4a_cleaned/decode_aishell_test_tg/cer_14_0.5
%WER 6.01 [ 6299 / 104765, 129 ins, 175 del, 5995 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_aishell_tg/cer_11_1.0
%WER 6.01 [ 6298 / 104765, 128 ins, 176 del, 5994 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_aishell_tg/cer_11_1.0
%WER 5.90 [ 6176 / 104765, 119 ins, 169 del, 5888 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_aishell_tg/cer_11_1.0
%WER 5.90 [ 6177 / 104765, 121 ins, 168 del, 5888 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_aishell_tg/cer_11_1.0

# aidatatang test set results
%WER 33.86 [ 158799 / 468933, 3856 ins, 33811 del, 121132 sub ] exp/tri1b/decode_aidatatang_test_tg/cer_14_0.0
%WER 32.62 [ 152977 / 468933, 4182 ins, 31249 del, 117546 sub ] exp/tri2a/decode_aidatatang_test_tg/cer_14_0.0
%WER 23.67 [ 111009 / 468933, 4535 ins, 19118 del, 87356 sub ] exp/tri3a/decode_aidatatang_test_tg/cer_14_0.0
%WER 20.01 [ 93829 / 468933, 4563 ins, 16970 del, 72296 sub ] exp/tri4a/decode_aidatatang_test_tg/cer_15_0.0
%WER 17.85 [ 83717 / 468933, 6506 ins, 13716 del, 63495 sub ] exp/tri4a_cleaned/decode_aidatatang_test_tg/cer_15_0.0
%WER 4.99 [ 23403 / 468933, 1954 ins, 3371 del, 18078 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_aidatatang_tg/cer_11_0.0
%WER 4.99 [ 23385 / 468933, 1965 ins, 3356 del, 18064 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_aidatatang_tg/cer_11_0.0
%WER 4.98 [ 23370 / 468933, 2190 ins, 3188 del, 17992 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_aidatatang_tg/cer_10_0.0
%WER 4.98 [ 23371 / 468933, 2224 ins, 3171 del, 17976 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_aidatatang_tg/cer_10_0.0

# magicdata test set results
%WER 27.01 [ 64815 / 239927, 4838 ins, 14852 del, 45125 sub ] exp/tri1b/decode_magicdata_test_tg/cer_17_0.0
%WER 27.10 [ 65010 / 239927, 5746 ins, 12552 del, 46712 sub ] exp/tri2a/decode_magicdata_test_tg/cer_17_0.0
%WER 22.42 [ 53784 / 239927, 6513 ins, 7409 del, 39862 sub ] exp/tri3a/decode_magicdata_test_tg/cer_17_0.0
%WER 15.45 [ 37076 / 239927, 3942 ins, 5217 del, 27917 sub ] exp/tri4a/decode_magicdata_test_tg/cer_17_0.0
%WER 13.99 [ 33568 / 239927, 6267 ins, 3705 del, 23596 sub ] exp/tri4a_cleaned/decode_magicdata_test_tg/cer_17_0.5
%WER 4.21 [ 10112 / 239927, 1443 ins, 1927 del, 6742 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_magicdata_tg/cer_11_0.5
%WER 4.23 [ 10158 / 239927, 1299 ins, 2032 del, 6827 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_magicdata_tg/cer_11_1.0
%WER 4.24 [ 10180 / 239927, 1405 ins, 2001 del, 6774 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_magicdata_tg/cer_11_1.0
%WER 4.25 [ 10188 / 239927, 1428 ins, 1997 del, 6763 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_magicdata_tg/cer_11_1.0

# thchs test set results
%WER 35.75 [ 29005 / 81139, 353 ins, 1824 del, 26828 sub ] exp/tri1b/decode_thchs_test_tg/cer_10_1.0
%WER 32.59 [ 26446 / 81139, 326 ins, 1622 del, 24498 sub ] exp/tri2a/decode_thchs_test_tg/cer_11_0.5
%WER 30.26 [ 24549 / 81139, 328 ins, 1412 del, 22809 sub ] exp/tri3a/decode_thchs_test_tg/cer_10_1.0
%WER 27.67 [ 22449 / 81139, 410 ins, 1102 del, 20937 sub ] exp/tri4a/decode_thchs_test_tg/cer_10_0.5
%WER 25.41 [ 20615 / 81139, 399 ins, 847 del, 19369 sub ] exp/tri4a_cleaned/decode_thchs_test_tg/cer_11_0.5
%WER 13.02 [ 10561 / 81139, 134 ins, 261 del, 10166 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_thchs_tg/cer_9_1.0
%WER 13.00 [ 10552 / 81139, 132 ins, 259 del, 10161 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_thchs_tg/cer_9_1.0
%WER 12.96 [ 10514 / 81139, 120 ins, 300 del, 10094 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_thchs_tg/cer_10_1.0
%WER 12.94 [ 10499 / 81139, 120 ins, 299 del, 10080 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_thchs_tg/cer_10_1.0

# GMM results w/ corpus LM
# ./run.sh --stage 17 --corpus-lm true
Expand Down
122 changes: 122 additions & 0 deletions egs/multi_cn/s5/local/chain/compare_cer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/bin/bash

# This script is modified from egs/librispeech/s5/local/chain/compare_wer.sh

# this script is used for comparing decoding results between systems.
# e.g. local/chain/compare_cer.sh exp/chain/tdnn_{c,d}_sp
# For use with discriminatively trained systems you specify the epochs after a colon:
# for instance,
# local/chain/compare_cer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3}


# Bail out with usage help when the caller gave no experiment directories.
if (( $# == 0 )); then
  echo "Usage: $0: [--online] <dir1> [<dir2> ... ]"
  echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp"
  echo "or (with epoch numbers for discriminative training):"
  echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}"
  exit 1
fi

# Record the exact command line at the top of the emitted table.
echo "# $0 $*"

# Optional leading --online flag: also print rows for the *_online decode dirs.
include_online=false
if [[ "$1" == "--online" ]]; then
  include_online=true
  shift
fi


# Flipped to true by set_names whenever a "dir:epoch" argument is seen.
used_epochs=false

# this function set_names is used to separate the epoch-related parts of the name
# [for discriminative training] and the regular parts of the name.
# If called with a colon-free directory name, like:
# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr
# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix=""
# If called with something like:
# set_names exp/chain/tdnn_d_sp_smbr:3
# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3"


# set_names <dir>[:<epoch>]
# Separates the epoch-related part of a system name (used for
# discriminatively trained models) from the directory part, setting the
# globals read by the table loops below:
#   dirname     - the part before any colon, e.g. exp/chain/tdnn_d_sp_smbr
#   epoch_infix - "" for a plain directory, "_epoch<N>" for dir:<N>
# Also flips the global used_epochs to true when an epoch suffix is present.
set_names() {
  if [ $# != 1 ]; then
    echo "compare_cer.sh: internal error"
    exit 1 # exit the program
  fi
  # Quote every expansion: an argument containing whitespace or glob
  # characters must not be word-split before reaching cut/test.
  dirname=$(echo "$1" | cut -d: -f1)
  epoch=$(echo "$1" | cut -s -d: -f2)  # -s: empty output when no ':' present
  if [ -z "$epoch" ]; then
    epoch_infix=""
  else
    used_epochs=true
    epoch_infix=_epoch${epoch}
  fi
}



# Header row: the basename of each system directory, right-aligned into
# 10-character columns so the value rows below line up underneath it.
echo -n "# System                "
for x in $*; do printf "% 10s" " $(basename $x)"; done
echo

# Row labels, one per test set; the order here must stay in step with the
# decode_names array inside the loop below.
strings=(
"# WER on aidatatang(tg)  "
"# WER on aishell(tg)     "
"# WER on magicdata(tg)   "
"# WER on thchs30(tg)     ")

# One row per test set: for every system, pick the best CER over all cer_*
# files in its decode dir.  utils/best_wer.sh selects the lowest-error line;
# field 2 of its output is the percentage we print.
for n in 0 1 2 3; do
echo -n "${strings[$n]}"
for x in $*; do
set_names $x # sets $dirname and $epoch_infix
# Decode-directory suffixes, indexed in the same order as $strings; also
# reused by the online row below.
decode_names=(aidatatang_tg aishell_tg magicdata_tg thchs_tg)

wer=$(grep WER $dirname/decode_${decode_names[$n]}/cer_* | utils/best_wer.sh | awk '{print $2}')
printf "% 10s" $wer
done
echo
# With --online, add a companion row read from the ${dirname}_online decode
# directories produced by online decoding.
if $include_online; then
echo -n "# [online:]             "
for x in $*; do
set_names $x # sets $dirname and $epoch_infix
wer=$(grep WER ${dirname}_online/decode_${decode_names[$n]}/cer_* | utils/best_wer.sh | awk '{print $2}')
printf "% 10s" $wer
done
echo
fi
done

# Objective-value rows.  The chain and cross-entropy objectives both live on
# the "Overall ..." line (8th field) of the nnet3 diagnostic logs; the chain
# value is on the line WITHOUT the word "xent", the xent value on the line
# WITH it.  The four rows below differ only in log name and xent filter, so
# they share one helper.

# print_prob_row <label> <log-basename> <-v|-w> <dir>...
#   $1 - row label, pre-padded so columns line up with the WER rows above
#   $2 - log file basename under <dir>/log/
#   $3 - grep flag applied to the pattern "xent": -v keeps the chain
#        objective line, -w keeps the cross-entropy line
#   remaining args - the system directories to print one column each for
print_prob_row() {
  local label=$1 logname=$2 xent_flag=$3 x prob
  shift 3
  echo -n "$label"
  for x in "$@"; do
    prob=$(grep Overall "$x/log/$logname" | grep "$xent_flag" xent | awk '{printf("%.4f", $8)}')
    printf "% 10s" "$prob"
  done
  echo
}

print_prob_row "# Final train prob      " compute_prob_train.final.log -v "$@"
print_prob_row "# Final valid prob      " compute_prob_valid.final.log -v "$@"
print_prob_row "# Final train prob (xent) " compute_prob_train.final.log -w "$@"
print_prob_row "# Final valid prob (xent) " compute_prob_valid.final.log -w "$@"

# Model size, read from the first training-progress log of each system.
echo -n "# Num-parameters        "
for x in "$@"; do
  num_params=$(grep num-parameters "$x/log/progress.1.log" | awk '{print $2}')
  # $num_params deliberately unquoted: with a missing argument this printf
  # prints a padded 0 instead of erroring when the log line is absent.
  printf "% 10d" $num_params
done
echo
33 changes: 13 additions & 20 deletions egs/multi_cn/s5/local/chain/run_ivector_common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,29 +75,22 @@ if [ $stage -le 3 ]; then

# do volume-perturbation on the training data prior to extracting hires
# features; this helps make trained nnets more invariant to test data volume.
# create MFCC data dir without pitch to extract iVector
utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires
steps/make_mfcc_pitch_online.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${train_set}_sp_hires || exit 1;
steps/compute_cmvn_stats.sh data/${train_set}_sp_hires || exit 1;
utils/fix_data_dir.sh data/${train_set}_sp_hires
utils/data/limit_feature_dim.sh 0:39 \
data/${train_set}_sp_hires data/${train_set}_sp_hires_nopitch || exit 1;
steps/compute_cmvn_stats.sh data/${train_set}_sp_hires_nopitch || exit 1;

for datadir in $test_sets; do
steps/make_mfcc_pitch_online.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \
steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/$datadir/test_hires || exit 1;
steps/compute_cmvn_stats.sh data/$datadir/test_hires || exit 1;
utils/fix_data_dir.sh data/$datadir/test_hires
utils/data/limit_feature_dim.sh 0:39 \
data/$datadir/test_hires data/$datadir/test_hires_nopitch || exit 1;
steps/compute_cmvn_stats.sh data/$datadir/test_hires_nopitch || exit 1;
done

# now create a data subset. 60k is 1/5th of the training dataset (around 200 hours).
utils/subset_data_dir.sh data/${train_set}_sp_hires_nopitch 60000 \
data/${train_set}_sp_hires_nopitch_60k
utils/subset_data_dir.sh data/${train_set}_sp_hires 60000 \
data/${train_set}_sp_hires_60k
fi


Expand All @@ -107,24 +100,24 @@ if [ $stage -le 4 ]; then
mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm

num_utts_total=$(wc -l <data/${train_set}_sp_hires_nopitch/utt2spk)
num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
num_utts=$[$num_utts_total/100]
utils/data/subset_data_dir.sh data/${train_set}_sp_hires_nopitch \
$num_utts ${temp_data_root}/${train_set}_sp_hires_nopitch_subset
utils/data/subset_data_dir.sh data/${train_set}_sp_hires \
$num_utts ${temp_data_root}/${train_set}_sp_hires_subset

echo "$0: computing a PCA transform from the hires data."
steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
--max-utts 10000 --subsample 2 \
${temp_data_root}/${train_set}_sp_hires_nopitch_subset \
${temp_data_root}/${train_set}_sp_hires_subset \
exp/nnet3${nnet3_affix}/pca_transform

echo "$0: training the diagonal UBM."
# Use 512 Gaussians in the UBM.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
--num-frames 700000 \
--num-threads $num_threads_ubm \
${temp_data_root}/${train_set}_sp_hires_nopitch_subset 512 \
${temp_data_root}/${train_set}_sp_hires_subset 512 \
exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
fi

Expand All @@ -135,7 +128,7 @@ if [ $stage -le 5 ]; then
# we use just the 60k subset (about one fifth of the data, or 200 hours).
echo "$0: training the iVector extractor"
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
--num-processes $num_processes data/${train_set}_sp_hires_nopitch_60k \
--num-processes $num_processes data/${train_set}_sp_hires_60k \
exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1;
fi

Expand All @@ -154,18 +147,18 @@ if [ $stage -le 6 ]; then
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
data/${train_set}_sp_hires_nopitch ${ivectordir}/${train_set}_sp_hires_nopitch_max2
data/${train_set}_sp_hires ${ivectordir}/${train_set}_sp_hires_max2

steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
${ivectordir}/${train_set}_sp_hires_nopitch_max2 exp/nnet3${nnet3_affix}/extractor \
${ivectordir}/${train_set}_sp_hires_max2 exp/nnet3${nnet3_affix}/extractor \
$ivectordir || exit 1;
fi

if [ $stage -le 7 ]; then
echo "$0: extracting iVectors for test data"
for data in $test_sets; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
data/${data}/test_hires_nopitch exp/nnet3${nnet3_affix}/extractor \
data/${data}/test_hires exp/nnet3${nnet3_affix}/extractor \
exp/nnet3${nnet3_affix}/ivectors_${data}_hires || exit 1;
done
fi
Expand Down
49 changes: 22 additions & 27 deletions egs/multi_cn/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
Original file line number Diff line number Diff line change
@@ -1,34 +1,28 @@
#!/bin/bash

# This script is copied from librispeech/s5
# In a previous version, pitch was used together with hires MFCCs; however,
# removing pitch does not cause any regression and it simplifies online
# decoding, so pitch has been removed in this recipe.

# This is based on tdnn_1d_sp, but adding cnn as the front-end.
# The cnn-tdnn-f (tdnn_cnn_1a_sp) outperforms the tdnn-f (tdnn_1d_sp).

# bash local/chain/compare_wer.sh exp/chain_cleaned/tdnn_1d_sp exp/chain_cleaned/tdnn_cnn_1a_sp/
# System tdnn_1d_sp tdnn_cnn_1a_sp
# WER on dev(fglarge) 3.29 3.34
# WER on dev(tglarge) 3.44 3.39
# WER on dev(tgmed) 4.22 4.29
# WER on dev(tgsmall) 4.72 4.77
# WER on dev_other(fglarge) 8.71 8.62
# WER on dev_other(tglarge) 9.05 9.00
# WER on dev_other(tgmed) 11.09 10.93
# WER on dev_other(tgsmall) 12.13 12.02
# WER on test(fglarge) 3.80 3.69
# WER on test(tglarge) 3.89 3.80
# WER on test(tgmed) 4.72 4.64
# WER on test(tgsmall) 5.19 5.16
# WER on test_other(fglarge) 8.76 8.71
# WER on test_other(tglarge) 9.19 9.11
# WER on test_other(tgmed) 11.22 11.00
# WER on test_other(tgsmall) 12.24 12.16
# Final train prob -0.0378 -0.0420
# Final valid prob -0.0374 -0.0400
# Final train prob (xent) -0.6099 -0.6881
# Final valid prob (xent) -0.6353 -0.7180
# Num-parameters 22623456 18100736

# local/chain/compare_cer.sh --online exp/chain_cleaned/tdnn_cnn_1a_pitch_sp exp/chain_nopitch/tdnn_cnn_1a_sp
# System tdnn_cnn_1a_pitch_sp tdnn_cnn_1a_sp
# WER on aidatatang(tg) 4.99 4.98
# [online:] 4.99 4.98
# WER on aishell(tg) 6.01 5.90
# [online:] 6.01 5.90
# WER on magicdata(tg) 4.21 4.24
# [online:] 4.23 4.25
# WER on thchs30(tg) 13.02 12.96
# [online:] 13.00 12.94
# Final train prob -0.0436 -0.0438
# Final valid prob -0.0553 -0.0544
# Final train prob (xent) -0.8083 -0.8157
# Final valid prob (xent) -0.8766 -0.8730
# Num-parameters 19141072 19141072

set -e

Expand All @@ -53,6 +47,7 @@ common_egs_dir=
xent_regularize=0.1
dropout_schedule='0,0@0.20,0.5@0.50,0'

test_sets=""
test_online_decoding=true # if true, it will run the last decoding stage.

# End configuration section.
Expand Down Expand Up @@ -84,7 +79,7 @@ ali_dir=exp/${gmm}_ali_${train_set}_sp
tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
lang=data/lang_chain
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
dir=exp/chain${nnet3_affix}/tdnn${affix:+_$affix}_sp
dir=exp/chain${nnet3_affix}/tdnn${affix:+_$affix}_sp
train_data_dir=data/${train_set}_sp_hires
lores_train_data_dir=data/${train_set}_sp
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
Expand Down Expand Up @@ -126,7 +121,7 @@ if [ $stage -le 14 ]; then

cat <<EOF > $dir/configs/network.xconfig
input dim=100 name=ivector
input dim=43 name=input
input dim=40 name=input
# MFCC to filterbank
idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat
Expand Down Expand Up @@ -237,7 +232,7 @@ if $test_online_decoding && [ $stage -le 18 ]; then
# note: if the features change (e.g. you add pitch features), you will have to
# change the options of the following command line.
steps/online/nnet3/prepare_online_decoding.sh \
--mfcc-config conf/mfcc_hires.conf --add-pitch true \
--mfcc-config conf/mfcc_hires.conf \
$lang exp/nnet3${nnet3_affix}/extractor $dir ${dir}_online

rm $dir/.error 2>/dev/null || true
Expand Down
Loading

0 comments on commit e3a1b99

Please sign in to comment.