Skip to content

Commit

Permalink
[egs] Remove pitch from multi_cn nnet3 recipe (#3686)
Browse files Browse the repository at this point in the history
  • Loading branch information
naxingyu authored and danpovey committed Oct 28, 2019
1 parent e2c006c commit e3a1b99
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 71 deletions.
16 changes: 8 additions & 8 deletions egs/multi_cn/s5/RESULTS
Original file line number Diff line number Diff line change
Expand Up @@ -6,35 +6,35 @@
%WER 19.03 [ 19941 / 104765, 725 ins, 1222 del, 17994 sub ] exp/tri3a/decode_aishell_test_tg/cer_13_0.5
%WER 21.68 [ 22710 / 104765, 902 ins, 2361 del, 19447 sub ] exp/tri4a/decode_aishell_test_tg/cer_14_0.0
%WER 16.64 [ 17436 / 104765, 857 ins, 706 del, 15873 sub ] exp/tri4a_cleaned/decode_aishell_test_tg/cer_14_0.5
%WER 6.01 [ 6299 / 104765, 129 ins, 175 del, 5995 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_aishell_tg/cer_11_1.0
%WER 6.01 [ 6298 / 104765, 128 ins, 176 del, 5994 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_aishell_tg/cer_11_1.0
%WER 5.90 [ 6176 / 104765, 119 ins, 169 del, 5888 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_aishell_tg/cer_11_1.0
%WER 5.90 [ 6177 / 104765, 121 ins, 168 del, 5888 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_aishell_tg/cer_11_1.0

# aidatatang test set results
%WER 33.86 [ 158799 / 468933, 3856 ins, 33811 del, 121132 sub ] exp/tri1b/decode_aidatatang_test_tg/cer_14_0.0
%WER 32.62 [ 152977 / 468933, 4182 ins, 31249 del, 117546 sub ] exp/tri2a/decode_aidatatang_test_tg/cer_14_0.0
%WER 23.67 [ 111009 / 468933, 4535 ins, 19118 del, 87356 sub ] exp/tri3a/decode_aidatatang_test_tg/cer_14_0.0
%WER 20.01 [ 93829 / 468933, 4563 ins, 16970 del, 72296 sub ] exp/tri4a/decode_aidatatang_test_tg/cer_15_0.0
%WER 17.85 [ 83717 / 468933, 6506 ins, 13716 del, 63495 sub ] exp/tri4a_cleaned/decode_aidatatang_test_tg/cer_15_0.0
%WER 4.99 [ 23403 / 468933, 1954 ins, 3371 del, 18078 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_aidatatang_tg/cer_11_0.0
%WER 4.99 [ 23385 / 468933, 1965 ins, 3356 del, 18064 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_aidatatang_tg/cer_11_0.0
%WER 4.98 [ 23370 / 468933, 2190 ins, 3188 del, 17992 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_aidatatang_tg/cer_10_0.0
%WER 4.98 [ 23371 / 468933, 2224 ins, 3171 del, 17976 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_aidatatang_tg/cer_10_0.0

# magicdata test set results
%WER 27.01 [ 64815 / 239927, 4838 ins, 14852 del, 45125 sub ] exp/tri1b/decode_magicdata_test_tg/cer_17_0.0
%WER 27.10 [ 65010 / 239927, 5746 ins, 12552 del, 46712 sub ] exp/tri2a/decode_magicdata_test_tg/cer_17_0.0
%WER 22.42 [ 53784 / 239927, 6513 ins, 7409 del, 39862 sub ] exp/tri3a/decode_magicdata_test_tg/cer_17_0.0
%WER 15.45 [ 37076 / 239927, 3942 ins, 5217 del, 27917 sub ] exp/tri4a/decode_magicdata_test_tg/cer_17_0.0
%WER 13.99 [ 33568 / 239927, 6267 ins, 3705 del, 23596 sub ] exp/tri4a_cleaned/decode_magicdata_test_tg/cer_17_0.5
%WER 4.21 [ 10112 / 239927, 1443 ins, 1927 del, 6742 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_magicdata_tg/cer_11_0.5
%WER 4.23 [ 10158 / 239927, 1299 ins, 2032 del, 6827 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_magicdata_tg/cer_11_1.0
%WER 4.24 [ 10180 / 239927, 1405 ins, 2001 del, 6774 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_magicdata_tg/cer_11_1.0
%WER 4.25 [ 10188 / 239927, 1428 ins, 1997 del, 6763 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_magicdata_tg/cer_11_1.0

# thchs test set results
%WER 35.75 [ 29005 / 81139, 353 ins, 1824 del, 26828 sub ] exp/tri1b/decode_thchs_test_tg/cer_10_1.0
%WER 32.59 [ 26446 / 81139, 326 ins, 1622 del, 24498 sub ] exp/tri2a/decode_thchs_test_tg/cer_11_0.5
%WER 30.26 [ 24549 / 81139, 328 ins, 1412 del, 22809 sub ] exp/tri3a/decode_thchs_test_tg/cer_10_1.0
%WER 27.67 [ 22449 / 81139, 410 ins, 1102 del, 20937 sub ] exp/tri4a/decode_thchs_test_tg/cer_10_0.5
%WER 25.41 [ 20615 / 81139, 399 ins, 847 del, 19369 sub ] exp/tri4a_cleaned/decode_thchs_test_tg/cer_11_0.5
%WER 13.02 [ 10561 / 81139, 134 ins, 261 del, 10166 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_thchs_tg/cer_9_1.0
%WER 13.00 [ 10552 / 81139, 132 ins, 259 del, 10161 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_thchs_tg/cer_9_1.0
%WER 12.96 [ 10514 / 81139, 120 ins, 300 del, 10094 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp/decode_thchs_tg/cer_10_1.0
%WER 12.94 [ 10499 / 81139, 120 ins, 299 del, 10080 sub ] exp/chain_cleaned/tdnn_cnn_1a_sp_online/decode_thchs_tg/cer_10_1.0

# GMM results w/ corpus LM
# ./run.sh --stage 17 --corpus-lm true
Expand Down
122 changes: 122 additions & 0 deletions egs/multi_cn/s5/local/chain/compare_cer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/bin/bash

# This script is modified from egs/librispeech/s5/local/chain/compare_wer.sh

# this script is used for comparing decoding results between systems.
# e.g. local/chain/compare_cer.sh exp/chain/tdnn_{c,d}_sp
# For use with discriminatively trained systems you specify the epochs after a colon:
# for instance,
# local/chain/compare_cer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3}


# Bail out with usage help when the caller gave no experiment directories.
if (( $# == 0 )); then
  echo "Usage: $0: [--online] <dir1> [<dir2> ... ]"
  echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp"
  echo "or (with epoch numbers for discriminative training):"
  echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}"
  exit 1
fi

# Record the exact command line at the top of the emitted table.
echo "# $0 $*"

# Optional leading --online flag: also print rows for the *_online decode dirs.
include_online=false
if [[ "$1" == "--online" ]]; then
  include_online=true
  shift
fi


# Flipped to true by set_names whenever a "dir:epoch" argument is seen.
used_epochs=false

# this function set_names is used to separate the epoch-related parts of the name
# [for discriminative training] and the regular parts of the name.
# If called with a colon-free directory name, like:
# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr
# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix=""
# If called with something like:
# set_names exp/chain/tdnn_d_sp_smbr:3
# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3"


# set_names <dir>[:<epoch>]
# Separates the epoch-related part of a system name (used for
# discriminatively trained models) from the directory part, setting the
# globals read by the table loops below:
#   dirname     - the part before any colon, e.g. exp/chain/tdnn_d_sp_smbr
#   epoch_infix - "" for a plain directory, "_epoch<N>" for dir:<N>
# Also flips the global used_epochs to true when an epoch suffix is present.
set_names() {
  if [ $# != 1 ]; then
    echo "compare_cer.sh: internal error"
    exit 1 # exit the program
  fi
  # Quote every expansion: an argument containing whitespace or glob
  # characters must not be word-split before reaching cut/test.
  dirname=$(echo "$1" | cut -d: -f1)
  epoch=$(echo "$1" | cut -s -d: -f2)  # -s: empty output when no ':' present
  if [ -z "$epoch" ]; then
    epoch_infix=""
  else
    used_epochs=true
    epoch_infix=_epoch${epoch}
  fi
}



# Header row: the basename of each system directory, right-aligned into
# 10-character columns so the value rows below line up underneath it.
echo -n "# System                "
for x in $*; do printf "% 10s" " $(basename $x)"; done
echo

# Row labels, one per test set; the order here must stay in step with the
# decode_names array inside the loop below.
strings=(
"# WER on aidatatang(tg)  "
"# WER on aishell(tg)     "
"# WER on magicdata(tg)   "
"# WER on thchs30(tg)     ")

# One row per test set: for every system, pick the best CER over all cer_*
# files in its decode dir.  utils/best_wer.sh selects the lowest-error line;
# field 2 of its output is the percentage we print.
for n in 0 1 2 3; do
echo -n "${strings[$n]}"
for x in $*; do
set_names $x # sets $dirname and $epoch_infix
# Decode-directory suffixes, indexed in the same order as $strings; also
# reused by the online row below.
decode_names=(aidatatang_tg aishell_tg magicdata_tg thchs_tg)

wer=$(grep WER $dirname/decode_${decode_names[$n]}/cer_* | utils/best_wer.sh | awk '{print $2}')
printf "% 10s" $wer
done
echo
# With --online, add a companion row read from the ${dirname}_online decode
# directories produced by online decoding.
if $include_online; then
echo -n "# [online:]             "
for x in $*; do
set_names $x # sets $dirname and $epoch_infix
wer=$(grep WER ${dirname}_online/decode_${decode_names[$n]}/cer_* | utils/best_wer.sh | awk '{print $2}')
printf "% 10s" $wer
done
echo
fi
done

# Objective-value rows.  The chain and cross-entropy objectives both live on
# the "Overall ..." line (8th field) of the nnet3 diagnostic logs; the chain
# value is on the line WITHOUT the word "xent", the xent value on the line
# WITH it.  The four rows below differ only in log name and xent filter, so
# they share one helper.

# print_prob_row <label> <log-basename> <-v|-w> <dir>...
#   $1 - row label, pre-padded so columns line up with the WER rows above
#   $2 - log file basename under <dir>/log/
#   $3 - grep flag applied to the pattern "xent": -v keeps the chain
#        objective line, -w keeps the cross-entropy line
#   remaining args - the system directories to print one column each for
print_prob_row() {
  local label=$1 logname=$2 xent_flag=$3 x prob
  shift 3
  echo -n "$label"
  for x in "$@"; do
    prob=$(grep Overall "$x/log/$logname" | grep "$xent_flag" xent | awk '{printf("%.4f", $8)}')
    printf "% 10s" "$prob"
  done
  echo
}

print_prob_row "# Final train prob      " compute_prob_train.final.log -v "$@"
print_prob_row "# Final valid prob      " compute_prob_valid.final.log -v "$@"
print_prob_row "# Final train prob (xent) " compute_prob_train.final.log -w "$@"
print_prob_row "# Final valid prob (xent) " compute_prob_valid.final.log -w "$@"

# Model size, read from the first training-progress log of each system.
echo -n "# Num-parameters        "
for x in "$@"; do
  num_params=$(grep num-parameters "$x/log/progress.1.log" | awk '{print $2}')
  # $num_params deliberately unquoted: with a missing argument this printf
  # prints a padded 0 instead of erroring when the log line is absent.
  printf "% 10d" $num_params
done
echo
33 changes: 13 additions & 20 deletions egs/multi_cn/s5/local/chain/run_ivector_common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,29 +75,22 @@ if [ $stage -le 3 ]; then

# do volume-perturbation on the training data prior to extracting hires
# features; this helps make trained nnets more invariant to test data volume.
# create MFCC data dir without pitch to extract iVector
utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires
steps/make_mfcc_pitch_online.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${train_set}_sp_hires || exit 1;
steps/compute_cmvn_stats.sh data/${train_set}_sp_hires || exit 1;
utils/fix_data_dir.sh data/${train_set}_sp_hires
utils/data/limit_feature_dim.sh 0:39 \
data/${train_set}_sp_hires data/${train_set}_sp_hires_nopitch || exit 1;
steps/compute_cmvn_stats.sh data/${train_set}_sp_hires_nopitch || exit 1;

for datadir in $test_sets; do
steps/make_mfcc_pitch_online.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \
steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/$datadir/test_hires || exit 1;
steps/compute_cmvn_stats.sh data/$datadir/test_hires || exit 1;
utils/fix_data_dir.sh data/$datadir/test_hires
utils/data/limit_feature_dim.sh 0:39 \
data/$datadir/test_hires data/$datadir/test_hires_nopitch || exit 1;
steps/compute_cmvn_stats.sh data/$datadir/test_hires_nopitch || exit 1;
done

# now create a data subset. 60k is 1/5th of the training dataset (around 200 hours).
utils/subset_data_dir.sh data/${train_set}_sp_hires_nopitch 60000 \
data/${train_set}_sp_hires_nopitch_60k
utils/subset_data_dir.sh data/${train_set}_sp_hires 60000 \
data/${train_set}_sp_hires_60k
fi


Expand All @@ -107,24 +100,24 @@ if [ $stage -le 4 ]; then
mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm

num_utts_total=$(wc -l <data/${train_set}_sp_hires_nopitch/utt2spk)
num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
num_utts=$[$num_utts_total/100]
utils/data/subset_data_dir.sh data/${train_set}_sp_hires_nopitch \
$num_utts ${temp_data_root}/${train_set}_sp_hires_nopitch_subset
utils/data/subset_data_dir.sh data/${train_set}_sp_hires \
$num_utts ${temp_data_root}/${train_set}_sp_hires_subset

echo "$0: computing a PCA transform from the hires data."
steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
--max-utts 10000 --subsample 2 \
${temp_data_root}/${train_set}_sp_hires_nopitch_subset \
${temp_data_root}/${train_set}_sp_hires_subset \
exp/nnet3${nnet3_affix}/pca_transform

echo "$0: training the diagonal UBM."
# Use 512 Gaussians in the UBM.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
--num-frames 700000 \
--num-threads $num_threads_ubm \
${temp_data_root}/${train_set}_sp_hires_nopitch_subset 512 \
${temp_data_root}/${train_set}_sp_hires_subset 512 \
exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
fi

Expand All @@ -135,7 +128,7 @@ if [ $stage -le 5 ]; then
# we use just the 60k subset (about one fifth of the data, or 200 hours).
echo "$0: training the iVector extractor"
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
--num-processes $num_processes data/${train_set}_sp_hires_nopitch_60k \
--num-processes $num_processes data/${train_set}_sp_hires_60k \
exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1;
fi

Expand All @@ -154,18 +147,18 @@ if [ $stage -le 6 ]; then
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
data/${train_set}_sp_hires_nopitch ${ivectordir}/${train_set}_sp_hires_nopitch_max2
data/${train_set}_sp_hires ${ivectordir}/${train_set}_sp_hires_max2

steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
${ivectordir}/${train_set}_sp_hires_nopitch_max2 exp/nnet3${nnet3_affix}/extractor \
${ivectordir}/${train_set}_sp_hires_max2 exp/nnet3${nnet3_affix}/extractor \
$ivectordir || exit 1;
fi

if [ $stage -le 7 ]; then
echo "$0: extracting iVectors for test data"
for data in $test_sets; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
data/${data}/test_hires_nopitch exp/nnet3${nnet3_affix}/extractor \
data/${data}/test_hires exp/nnet3${nnet3_affix}/extractor \
exp/nnet3${nnet3_affix}/ivectors_${data}_hires || exit 1;
done
fi
Expand Down
49 changes: 22 additions & 27 deletions egs/multi_cn/s5/local/chain/tuning/run_cnn_tdnn_1a.sh
Original file line number Diff line number Diff line change
@@ -1,34 +1,28 @@
#!/bin/bash

# This script is copied from librispeech/s5
# In a previous version, pitch was used together with hires MFCCs; however,
# removing pitch does not cause any regression and it simplifies online
# decoding, so pitch has been removed in this recipe.

# This is based on tdnn_1d_sp, but adding cnn as the front-end.
# The cnn-tdnn-f (tdnn_cnn_1a_sp) outperforms the tdnn-f (tdnn_1d_sp).

# bash local/chain/compare_wer.sh exp/chain_cleaned/tdnn_1d_sp exp/chain_cleaned/tdnn_cnn_1a_sp/
# System tdnn_1d_sp tdnn_cnn_1a_sp
# WER on dev(fglarge) 3.29 3.34
# WER on dev(tglarge) 3.44 3.39
# WER on dev(tgmed) 4.22 4.29
# WER on dev(tgsmall) 4.72 4.77
# WER on dev_other(fglarge) 8.71 8.62
# WER on dev_other(tglarge) 9.05 9.00
# WER on dev_other(tgmed) 11.09 10.93
# WER on dev_other(tgsmall) 12.13 12.02
# WER on test(fglarge) 3.80 3.69
# WER on test(tglarge) 3.89 3.80
# WER on test(tgmed) 4.72 4.64
# WER on test(tgsmall) 5.19 5.16
# WER on test_other(fglarge) 8.76 8.71
# WER on test_other(tglarge) 9.19 9.11
# WER on test_other(tgmed) 11.22 11.00
# WER on test_other(tgsmall) 12.24 12.16
# Final train prob -0.0378 -0.0420
# Final valid prob -0.0374 -0.0400
# Final train prob (xent) -0.6099 -0.6881
# Final valid prob (xent) -0.6353 -0.7180
# Num-parameters 22623456 18100736

# local/chain/compare_cer.sh --online exp/chain_cleaned/tdnn_cnn_1a_pitch_sp exp/chain_nopitch/tdnn_cnn_1a_sp
# System tdnn_cnn_1a_pitch_sp tdnn_cnn_1a_sp
# WER on aidatatang(tg) 4.99 4.98
# [online:] 4.99 4.98
# WER on aishell(tg) 6.01 5.90
# [online:] 6.01 5.90
# WER on magicdata(tg) 4.21 4.24
# [online:] 4.23 4.25
# WER on thchs30(tg) 13.02 12.96
# [online:] 13.00 12.94
# Final train prob -0.0436 -0.0438
# Final valid prob -0.0553 -0.0544
# Final train prob (xent) -0.8083 -0.8157
# Final valid prob (xent) -0.8766 -0.8730
# Num-parameters 19141072 19141072

set -e

Expand All @@ -53,6 +47,7 @@ common_egs_dir=
xent_regularize=0.1
dropout_schedule='0,0@0.20,0.5@0.50,0'

test_sets=""
test_online_decoding=true # if true, it will run the last decoding stage.

# End configuration section.
Expand Down Expand Up @@ -84,7 +79,7 @@ ali_dir=exp/${gmm}_ali_${train_set}_sp
tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
lang=data/lang_chain
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
dir=exp/chain${nnet3_affix}/tdnn${affix:+_$affix}_sp
dir=exp/chain${nnet3_affix}/tdnn${affix:+_$affix}_sp
train_data_dir=data/${train_set}_sp_hires
lores_train_data_dir=data/${train_set}_sp
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
Expand Down Expand Up @@ -126,7 +121,7 @@ if [ $stage -le 14 ]; then

cat <<EOF > $dir/configs/network.xconfig
input dim=100 name=ivector
input dim=43 name=input
input dim=40 name=input
# MFCC to filterbank
idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat
Expand Down Expand Up @@ -237,7 +232,7 @@ if $test_online_decoding && [ $stage -le 18 ]; then
# note: if the features change (e.g. you add pitch features), you will have to
# change the options of the following command line.
steps/online/nnet3/prepare_online_decoding.sh \
--mfcc-config conf/mfcc_hires.conf --add-pitch true \
--mfcc-config conf/mfcc_hires.conf \
$lang exp/nnet3${nnet3_affix}/extractor $dir ${dir}_online

rm $dir/.error 2>/dev/null || true
Expand Down
Loading

0 comments on commit e3a1b99

Please sign in to comment.