diff --git a/egs/ami/s5/RESULTS_ihm b/egs/ami/s5/RESULTS_ihm index 6435e9df47b..667c6362da3 100644 --- a/egs/ami/s5/RESULTS_ihm +++ b/egs/ami/s5/RESULTS_ihm @@ -23,30 +23,46 @@ exp/ihm/tri4a_mmi_b0.1/decode_eval_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4. # Karel, JSALT 2015, (21.7.2015) -# dev, -## GMM, +# GMM - dev, %WER 38.1 | 13098 94489 | 67.1 20.6 12.2 5.2 38.1 67.0 | exp/ihm/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.sys %WER 35.5 | 13098 94487 | 69.6 19.0 11.4 5.1 35.5 65.8 | exp/ihm/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.sys %WER 32.2 | 13098 94483 | 72.5 17.2 10.3 4.8 32.2 63.8 | exp/ihm/tri4a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_14/dev.ctm.filt.sys #0.1% worse than Pawel! %WER 30.2 | 13098 94479 | 74.0 15.6 10.4 4.2 30.2 61.9 | exp/ihm/tri4a_mmi_b0.1/decode_dev_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/dev.ctm.filt.sys -## DNN-Xent, -%WER 26.0 | 13098 94483 | 77.9 13.5 8.5 4.0 26.0 58.4 | exp/ihm/dnn4_pretrain-dbn_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_11/dev.ctm.filt.sys -## DNN-sMBR, -%WER 24.9 | 13098 94484 | 79.2 13.2 7.6 4.1 24.9 57.1 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/dev.ctm.filt.sys -%WER 24.3 | 13098 94481 | 79.6 12.6 7.8 3.9 24.3 56.3 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_12/dev.ctm.filt.sys -# eval, -## GMM, +# GMM - eval, %WER 43.9 | 12643 89978 | 60.8 25.3 13.9 4.8 43.9 65.6 | exp/ihm/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys %WER 40.8 | 12643 89985 | 63.8 23.6 12.6 4.7 40.8 64.6 | exp/ihm/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys %WER 35.1 | 12643 89975 | 69.1 19.8 11.1 4.2 35.1 61.8 | exp/ihm/tri4a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys #0.1% worse than Pawel! 
%WER 31.7 | 12643 89986 | 72.1 18.0 9.9 3.8 31.7 59.4 | exp/ihm/tri4a_mmi_b0.1/decode_eval_4.mdl_ami_fsh.o3g.kn.pr1-7/ascore_11/eval.ctm.filt.sys + +# nnet1 DNN on 'fmllr' system, RBMs 6x2048 neurons (21.7.2015): ## DNN-Xent, +%WER 26.0 | 13098 94483 | 77.9 13.5 8.5 4.0 26.0 58.4 | exp/ihm/dnn4_pretrain-dbn_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_11/dev.ctm.filt.sys %WER 27.1 | 12643 89971 | 76.4 15.5 8.1 3.5 27.1 57.2 | exp/ihm/dnn4_pretrain-dbn_dnn/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_10/eval.ctm.filt.sys ## DNN-sMBR, -%WER 25.4 | 12643 89974 | 77.9 14.7 7.4 3.3 25.4 55.1 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/eval.ctm.filt.sys +%WER 24.3 | 13098 94481 | 79.6 12.6 7.8 3.9 24.3 56.3 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_12/dev.ctm.filt.sys %WER 24.6 | 12643 89972 | 78.8 14.1 7.1 3.3 24.6 54.4 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_11/eval.ctm.filt.sys +# nnet1 'fmllr + relu + dropout' system (27.9.2016): +## DNN-Xent, +%WER 26.3 | 13098 94489 | 77.6 13.6 8.7 3.9 26.3 59.2 | 0.005 | exp/ihm/dnn4d-6L1024-relu/decode_dev/ascore_11/dev.ctm.filt.sys +%WER 27.2 | 12643 89970 | 75.9 15.0 9.1 3.1 27.2 57.4 | 0.053 | exp/ihm/dnn4d-6L1024-relu/decode_eval/ascore_11/eval.ctm.filt.sys +# => about the same as the larger RBM system, +## DNN-sMBR, +%WER 25.1 | 13098 94474 | 78.6 12.7 8.6 3.7 25.1 57.6 | -0.193 | exp/ihm/dnn4d-6L1024-relu_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_15/dev.ctm.filt.sys +%WER 25.2 | 12643 89977 | 77.7 14.1 8.2 2.9 25.2 55.1 | -0.138 | exp/ihm/dnn4d-6L1024-relu_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_15/eval.ctm.filt.sys +# => on sMBR worse than the larger RBM system (tuning learning rate did not help), + +# nnet1 DNN on 'fmllr + kaldi i-vector per-spk' system (3.10.2016), RBMs 6x2048 neurons: +## DNN-Xent, +%WER 24.7 | 13098 94475 | 79.2 12.7 8.2 3.9 24.7 58.1 | -0.018 | exp/ihm/dnn4_pretrain-dbn-ivec_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_11/dev.ctm.filt.sys +%WER 25.2 | 12643 89972 | 78.0 13.8 8.2 3.2 25.2 56.4 | 0.057 | exp/ihm/dnn4_pretrain-dbn-ivec_dnn/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_11/eval.ctm.filt.sys +## DNN-sMBR, +%WER 23.2 | 13098 94477 | 80.3 11.5 8.2 3.6 23.2 56.3 | 0.010 | exp/ihm/dnn4_pretrain-dbn-ivec_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_13/dev.ctm.filt.sys +%WER 22.8 | 12643 89982 | 80.2 12.8 7.1 3.0 22.8 53.8 | 0.060 | exp/ihm/dnn4_pretrain-dbn-ivec_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_12/eval.ctm.filt.sys +# => The kaldi i-vectors on fMLLR feats are helping nicely, +# (WER not too far from the chain systems, even w/o data augmentation, but with fMLLR), + # Vijay, TDNN results, for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/ascore_*/*.sys | utils/best_wer.sh; done diff --git a/egs/ami/s5/local/nnet/prepare_ivectors.sh b/egs/ami/s5/local/nnet/prepare_ivectors.sh new file mode 100755 index 00000000000..0bee63ec308 --- /dev/null +++ b/egs/ami/s5/local/nnet/prepare_ivectors.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Copyright 2016, Brno University of Technology (Author: Karel Vesely) +# Apache 2.0 + +. path.sh +. cmd.sh + +train=data_ihm-fmllr-tri4/ihm/train +dev=data_ihm-fmllr-tri4/ihm/dev +eval=data_ihm-fmllr-tri4/ihm/eval +gmm=exp/ihm/tri4a + +# Output directory for models and i-vectors, +ivec_absdir=$(readlink -m data_ihm-fmllr-tri4/ihm/ivector) + +. 
utils/parse_options.sh + +set -euxo pipefail + +# UBM training (1024 components), +ubm=$ivec_absdir/ubm +steps/nnet/ivector/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 \ + $train 1024 $ubm + +# Training i-vector extractor (100 dims), +iextractor=$ivec_absdir/iextractor +steps/nnet/ivector/train_ivector_extractor.sh --cmd "$train_cmd --mem 5G" --nj 10 \ + --ivector-dim 100 $train $ubm $iextractor + +# Extracting the i-vectors (per speaker, as the per-utterance copies), +steps/nnet/ivector/extract_ivectors.sh --cmd "$train_cmd" --nj 80 \ + $train data/lang $iextractor \ + ${gmm}_ali $ivec_absdir/ivec_train +steps/nnet/ivector/extract_ivectors.sh --cmd "$train_cmd" --nj 20 \ + $dev data/lang $iextractor \ + $gmm/decode_dev_ami_fsh.o3g.kn.pr1-7 $ivec_absdir/ivec_dev +steps/nnet/ivector/extract_ivectors.sh --cmd "$train_cmd" --nj 20 \ + $eval data/lang $iextractor \ + $gmm/decode_eval_ami_fsh.o3g.kn.pr1-7 $ivec_absdir/ivec_eval + + +# POST-PROCESS PER-SPEAKER I-VECTORS: + +# Get the global mean of the i-vectors (train), +ivector-mean scp:$ivec_absdir/ivec_train/ivectors_spk.scp $iextractor/global_mean + +# Merge the sets, normalize means, apply length normalization, +cat $ivec_absdir/ivec_train/ivectors_spk-as-utt.scp \ + $ivec_absdir/ivec_dev/ivectors_spk-as-utt.scp \ + $ivec_absdir/ivec_eval/ivectors_spk-as-utt.scp | \ + ivector-subtract-global-mean $iextractor/global_mean scp:- ark:- | \ + ivector-normalize-length --scaleup=false ark:- ark,scp:$ivec_absdir/ivectors_spk-as-utt_normalized.ark,$ivec_absdir/ivectors_spk-as-utt_normalized.scp + + +# POST-PROCESS PER-SENTENCE I-VECTORS: + +# Get the global mean of the i-vectors (train, per-sentence), +ivector-mean scp:$ivec_absdir/ivec_train/ivectors_utt.scp $iextractor/global_mean_utt + +# Merge the sets, normalize means, apply length normalization, +cat $ivec_absdir/ivec_train/ivectors_utt.scp \ + $ivec_absdir/ivec_dev/ivectors_utt.scp \ + $ivec_absdir/ivec_eval/ivectors_utt.scp | \ + ivector-subtract-global-mean $iextractor/global_mean_utt scp:- ark:- | \ + ivector-normalize-length --scaleup=false ark:- ark,scp:$ivec_absdir/ivectors_utt_normalized.ark,$ivec_absdir/ivectors_utt_normalized.scp + + +exit 0 # Done! diff --git a/egs/ami/s5/local/nnet/run_dnn_ivec.sh b/egs/ami/s5/local/nnet/run_dnn_ivec.sh new file mode 100755 index 00000000000..fb2531f6473 --- /dev/null +++ b/egs/ami/s5/local/nnet/run_dnn_ivec.sh @@ -0,0 +1,121 @@ +#!/bin/bash -u + +. ./cmd.sh +. ./path.sh + +# DNN training. This script is based on egs/swbd/s5b/local/run_dnn.sh +# Shinji Watanabe, Karel Vesely, + +# Config: +nj=80 +nj_decode=30 +stage=0 # resume training with --stage=N +. 
utils/parse_options.sh || exit 1; +# + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s [opts] \n\n" `basename $0` + exit 1; +fi +mic=$1 + +gmmdir=exp/$mic/tri4a +data_fmllr=data_${mic}-fmllr-tri4 + +final_lm=`cat data/local/lm/final_lm` +LM=$final_lm.pr1-7 +graph_dir=$gmmdir/graph_${LM} + +set -euxo pipefail + +# Store fMLLR features, so we can train on them easily, +if [ $stage -le 0 ]; then + # eval + dir=$data_fmllr/$mic/eval + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_eval_${LM} \ + $dir data/$mic/eval $gmmdir $dir/log $dir/data + # dev + dir=$data_fmllr/$mic/dev + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_dev_${LM} \ + $dir data/$mic/dev $gmmdir $dir/log $dir/data + # train + dir=$data_fmllr/$mic/train + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir ${gmmdir}_ali \ + $dir data/$mic/train $gmmdir $dir/log $dir/data + # split the data : 90% train 10% cross-validation (held-out) + utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 +fi + +# Prepare the i-vectors, +if [ $stage -le 1 ]; then + local/nnet/prepare_ivectors.sh +fi + +# Pre-train DBN, i.e. a stack of RBMs, +ivector=scp:$data_fmllr/ihm/ivector/ivectors_spk-as-utt_normalized.scp +if [ $stage -le 2 ]; then + dir=exp/$mic/dnn4_pretrain-dbn-ivec + $cuda_cmd $dir/log/pretrain_dbn.log \ + steps/nnet/pretrain_dbn.sh --rbm-iter 1 --ivector $ivector \ + $data_fmllr/$mic/train $dir +fi + +# Train the DNN optimizing per-frame cross-entropy, +if [ $stage -le 3 ]; then + dir=exp/$mic/dnn4_pretrain-dbn-ivec_dnn + ali=${gmmdir}_ali + feature_transform=exp/$mic/dnn4_pretrain-dbn-ivec/final.feature_transform + dbn=exp/$mic/dnn4_pretrain-dbn-ivec/6.dbn + # Train + $cuda_cmd $dir/log/train_nnet.log \ + steps/nnet/train.sh --feature-transform $feature_transform --ivector $ivector \ + --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ + $data_fmllr/$mic/train_tr90 $data_fmllr/$mic/train_cv10 data/lang $ali $ali $dir + # Decode (reuse HCLG graph) + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf --acwt 0.1 \ + --num-threads 3 --ivector $ivector \ + $graph_dir $data_fmllr/$mic/dev $dir/decode_dev_${LM} + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf --acwt 0.1 \ + --num-threads 3 --ivector $ivector \ + $graph_dir $data_fmllr/$mic/eval $dir/decode_eval_${LM} +fi + +# Sequence training using sMBR criterion, we do Stochastic-GD with +# per-utterance updates. We use usually good acwt 0.1. +# Lattices are not regenerated (it is faster). 
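# Illustration only (not part of the recipe): the --ivector option used in the
# pre-training, cross-entropy and decoding stages above appends the
# per-utterance copy of the speaker i-vector to every frame of the features;
# the nnet1 scripts do this with 'append-vector-to-feats' (see the
# steps/nnet/train_mpe.sh hunk later in this patch). A rough sketch of the
# effect, reusing variables from this script (the exact feature pipeline
# inside steps/nnet/train.sh may differ):
#   append-vector-to-feats scp:$data_fmllr/$mic/train/feats.scp \
#     scp:$data_fmllr/ihm/ivector/ivectors_spk-as-utt_normalized.scp ark:- | \
#     feat-to-dim ark:- -   # feature dim grows by 100 (the i-vector dim)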
+ +dir=exp/$mic/dnn4_pretrain-dbn-ivec_dnn_smbr +srcdir=exp/$mic/dnn4_pretrain-dbn-ivec_dnn +acwt=0.1 + +# Generate lattices and alignments, +if [ $stage -le 4 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" --ivector $ivector \ + $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --ivector $ivector \ + --config conf/decode_dnn.conf --acwt $acwt \ + $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_denlats +fi + +# Re-train the DNN by 4 epochs of sMBR, +if [ $stage -le 5 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + --ivector $ivector \ + $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir + # Decode (reuse HCLG graph) + for ITER in 4 1; do + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ + --nnet $dir/${ITER}.nnet --acwt $acwt --ivector $ivector \ + $graph_dir $data_fmllr/$mic/dev $dir/decode_dev_${LM}_it${ITER} + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ + --nnet $dir/${ITER}.nnet --acwt $acwt --ivector $ivector \ + $graph_dir $data_fmllr/$mic/eval $dir/decode_eval_${LM}_it${ITER} + done +fi + +# Getting results [see RESULTS file] +# for x in exp/$mic/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done + diff --git a/egs/ami/s5/local/nnet/run_relu.sh b/egs/ami/s5/local/nnet/run_relu.sh new file mode 100755 index 00000000000..1ee9fe3a0b6 --- /dev/null +++ b/egs/ami/s5/local/nnet/run_relu.sh @@ -0,0 +1,119 @@ +#!/bin/bash -u + +. ./cmd.sh +. ./path.sh + +# DNN training. This script is based on egs/swbd/s5b/local/run_dnn.sh +# Shinji Watanabe, Karel Vesely, + +# Config: +nj=80 +nj_decode=30 +stage=0 # resume training with --stage=N +. utils/parse_options.sh || exit 1; +# + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s [opts] \n\n" `basename $0` + exit 1; +fi +mic=$1 + +gmmdir=exp/$mic/tri4a +data_fmllr=data_${mic}-fmllr-tri4 + +final_lm=`cat data/local/lm/final_lm` +LM=$final_lm.pr1-7 +graph_dir=$gmmdir/graph_${LM} + +set -euxo pipefail + +# Store fMLLR features, so we can train on them easily, +if [ $stage -le 0 -a ! -e $data_fmllr/$mic/eval ]; then + # eval + dir=$data_fmllr/$mic/eval + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_eval_${LM} \ + $dir data/$mic/eval $gmmdir $dir/log $dir/data + # dev + dir=$data_fmllr/$mic/dev + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_dev_${LM} \ + $dir data/$mic/dev $gmmdir $dir/log $dir/data + # train + dir=$data_fmllr/$mic/train + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir ${gmmdir}_ali \ + $dir data/$mic/train $gmmdir $dir/log $dir/data + # split the data : 90% train 10% cross-validation (held-out) + utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 +fi + +train=data_ihm-fmllr-tri4/ihm/train +dev=data_ihm-fmllr-tri4/ihm/dev +eval=data_ihm-fmllr-tri4/ihm/eval + +lrate=0.00025 +param_std=0.02 +lr_alpha=1.0 +lr_beta=0.75 +dropout_schedule=0.2,0.2,0.2,0.2,0.2,0.0 +gmm=$gmmdir +graph=$graph_dir + +# Train 6 layer DNN from random initialization, +# - Parametric RELU, alphas+betas trained, +# - Dropout retention 0.8 in 5 initial epochs with fixed learning rate, +if [ $stage -le 1 ]; then + # Train the DNN optimizing per-frame cross-entropy. 
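# Illustration only (not part of the recipe): how the --dropout-schedule option
# above is consumed by steps/nnet/train_scheduler.sh (changed later in this
# patch). For the first N epochs the dropout rate is taken from the
# comma-separated list and applied with 'nnet-copy --dropout-rate=...'; once
# the list runs out, the current rate is kept. The five 0.2 entries correspond
# to the "Dropout retention 0.8" mentioned above, and the final 0.0 disables
# dropout.
#   dropout_array=($(echo $dropout_schedule | tr ',' ' '))
#   for iter in 01 02 03 04 05 06 07; do
#     rate=${dropout_array[$((${iter#0}-1))]-''}
#     [ -n "$rate" ] && echo "epoch $iter: nnet-copy --dropout-rate=$rate ..." \
#                    || echo "epoch $iter: keep current dropout rate"
#   done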
+ dir=exp/$mic/dnn4d-6L1024-relu + ali=${gmm}_ali + # Train + $cuda_cmd $dir/log/train_nnet.log \ + steps/nnet/train.sh --learn-rate $lrate \ + --splice 5 --hid-layers 6 --hid-dim 1024 \ + --proto-opts "--activation-type= --activation-opts=_${lr_alpha}__${lr_beta} --param-stddev-factor $param_std --hid-bias-mean 0 --hid-bias-range 0 --with-dropout --no-glorot-scaled-stddev --no-smaller-input-weights" \ + --scheduler-opts "--keep-lr-iters 5 --dropout-schedule $dropout_schedule" \ + ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir + # Decode (reuse HCLG graph) + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --acwt 0.1 \ + $graph $dev $dir/decode_$(basename $dev) + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --acwt 0.1 \ + $graph $eval $dir/decode_$(basename $eval) +fi + +# Sequence training using sMBR criterion, we do Stochastic-GD with +# per-utterance updates. We use usually good acwt 0.1. +# Lattices are not regenerated (it is faster). + +dir=exp/$mic/dnn4d-6L1024-relu_smbr +srcdir=exp/$mic/dnn4d-6L1024-relu +acwt=0.1 + +# Generate lattices and alignments, +if [ $stage -le 3 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.conf \ + --acwt $acwt $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_denlats +fi + +# Re-train the DNN by 4 epochs of sMBR, +if [ $stage -le 4 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + --learn-rate 0.0000003 \ + $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir + # Decode (reuse HCLG graph) + for ITER in 4 1; do + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir $data_fmllr/$mic/dev $dir/decode_dev_${LM}_it${ITER} + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir $data_fmllr/$mic/eval $dir/decode_eval_${LM}_it${ITER} + done +fi + +# Getting results [see RESULTS file] +# for x in exp/$mic/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done + diff --git a/egs/wsj/s5/steps/nnet/ivector/extract_ivectors.sh b/egs/wsj/s5/steps/nnet/ivector/extract_ivectors.sh new file mode 100755 index 00000000000..36af3ab49d8 --- /dev/null +++ b/egs/wsj/s5/steps/nnet/ivector/extract_ivectors.sh @@ -0,0 +1,219 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# 2016 Brno University of Technology (author: Karel Vesely) +# Apache 2.0. + + +# This script computes iVectors in the same format as extract_ivectors_online.sh, +# except that they are actually not really computed online, they are first computed +# per speaker and just duplicated many times. +# This is mainly intended for use in decoding, where you want the best possible +# quality of iVectors. +# +# This setup also makes it possible to use a previous decoding or alignment, to +# down-weight silence in the stats (default is --silence-weight 0.0). +# +# This is for when you use the "online-decoding" setup in an offline task, and +# you want the best possible results. + + +# Begin configuration section. 
+nj=30 +cmd="run.pl" +stage=0 +num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) + +posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for + # inter-frame correlations. Making this small during iVector + # extraction is equivalent to scaling up the prior, and will + # will tend to produce smaller iVectors where data-counts are + # small. It's not so important that this match the value + # used when training the iVector extractor, but more important + # that this match the value used when you do real online decoding + # with the neural nets trained with these iVectors. + +max_count=100 # Interpret this as a number of frames times posterior scale... + # this config ensures that once the count exceeds this (i.e. + # 1000 frames, or 10 seconds, by default), we start to scale + # down the stats, accentuating the prior term. This seems quite + # important for some reason. + +silence_weight=0.0 +acwt=0.1 # used if input is a decode dir, to get best path from lattices. +mdl=final # change this if decode directory did not have ../final.mdl present. + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ] && [ $# != 5 ]; then + echo "Usage: $0 [options] [||] " + echo " e.g.: $0 data/test exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " # Ignored if or supplied." + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + echo " --min-post # Pruning threshold for posteriors" + echo " --ivector-period # How often to extract an iVector (frames)" + echo " --posterior-scale # Scale on posteriors in iVector extraction; " + echo " # affects strength of prior term." + + exit 1; +fi + +set -euxo pipefail + +if [ $# -eq 4 ]; then + data=$1 + lang=$2 + srcdir=$3 + dir=$4 +else # 5 arguments + data=$1 + lang=$2 + srcdir=$3 + ali_or_decode_dir=$4 + dir=$5 +fi + +for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $lang/phones.txt; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +mkdir -p $dir/log +silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + +if [ ! -z "$ali_or_decode_dir" ]; then + + if [ -f $ali_or_decode_dir/ali.1.gz ]; then + if [ ! -f $ali_or_decode_dir/${mdl}.mdl ]; then + echo "$0: expected $ali_or_decode_dir/${mdl}.mdl to exist." + exit 1; + fi + nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1; + + if [ $stage -le 0 ]; then + rm $dir/weights.*.gz 2>/dev/null || true + + $cmd JOB=1:$nj_orig $dir/log/ali_to_post.JOB.log \ + gunzip -c $ali_or_decode_dir/ali.JOB.gz \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/final.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1; + + # put all the weights in one archive. 
+ for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1; + rm $dir/weights.*.gz || exit 1; + fi + + elif [ -f $ali_or_decode_dir/lat.1.gz ]; then + nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1; + if [ ! -f $ali_or_decode_dir/../${mdl}.mdl ]; then + echo "$0: expected $ali_or_decode_dir/../${mdl}.mdl to exist." + exit 1; + fi + + + if [ $stage -le 0 ]; then + rm $dir/weights.*.gz 2>/dev/null || true + + $cmd JOB=1:$nj_orig $dir/log/lat_to_post.JOB.log \ + lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/../${mdl}.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1; + + # put all the weights in one archive. + for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1; + rm $dir/weights.*.gz || exit 1; + fi + + elif [ -f $ali_or_decode_dir ] && gunzip -c $ali_or_decode_dir >/dev/null; then + cp $ali_or_decode_dir $dir/weights.gz || exit 1; + + else + echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir"; + exit 1; + fi +fi + +sdata=$data/split$nj; +utils/split_data.sh $data $nj || exit 1; + +gmm_feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" +feats="$gmm_feats" + +# (here originally was the sub-speaker hack), +this_sdata=$sdata + +# Per-speaker i-vectors, +if [ $stage -le 2 ]; then + if [ ! -z "$ali_or_decode_dir" ]; then + $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \ + ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \ + --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark + else + $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \ + --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark + fi +fi + +# Per-utterance i-vectors, +if [ $stage -le 3 ]; then + if [ ! -z "$ali_or_decode_dir" ]; then + $cmd JOB=1:$nj $dir/log/extract_ivectors_utt.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \ + ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true --max-count=$max_count \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_utt.JOB.ark + else + $cmd JOB=1:$nj $dir/log/extract_ivectors_utt.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true --max-count=$max_count \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_utt.JOB.ark + fi +fi + + +# get an utterance-level set of iVectors (just duplicate the speaker-level ones). +# note: if $this_sdata is set $dir/split$nj, then these won't be real speakers, they'll +# be "sub-speakers" (speakers split up into multiple utterances). 
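# Toy illustration (hypothetical data, not part of the script) of the
# duplication step below: 'utils/apply_map.pl -f 2' replaces the speaker-id in
# field 2 of utt2spk with that speaker's i-vector line, so every utterance of a
# speaker gets an identical copy of the per-speaker i-vector.
#   echo -e "spkA_utt1 spkA\nspkA_utt2 spkA\nspkB_utt1 spkB" >utt2spk
#   echo -e "spkA  [ 0.1 0.2 ]\nspkB  [ 0.3 0.4 ]" >ivectors_spk.1.ark
#   utils/apply_map.pl -f 2 ivectors_spk.1.ark <utt2spk
#   # -> spkA_utt1 [ 0.1 0.2 ]
#   #    spkA_utt2 [ 0.1 0.2 ]
#   #    spkB_utt1 [ 0.3 0.4 ]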
+if [ $stage -le 4 ]; then + for j in $(seq $nj); do + utils/apply_map.pl -f 2 $dir/ivectors_spk.${j}.ark <$this_sdata/$j/utt2spk >$dir/ivectors_spk-as-utt.${j}.ark + done +fi + +ivector_dim=$[$(head -n 1 $dir/ivectors_spk.1.ark | wc -w) - 3] +echo "$0: iVector dim is $ivector_dim" + +absdir=$(readlink -f $dir) + +if [ $stage -le 5 ]; then + echo "$0: merging iVectors across jobs" + copy-vector "ark:cat $dir/ivectors_spk.*.ark |" ark,scp:$absdir/ivectors_spk.ark,$dir/ivectors_spk.scp + rm $dir/ivectors_spk.*.ark + copy-vector "ark:cat $dir/ivectors_spk-as-utt.*.ark |" ark,scp:$absdir/ivectors_spk-as-utt.ark,$dir/ivectors_spk-as-utt.scp + rm $dir/ivectors_spk-as-utt.*.ark + copy-vector "ark:cat $dir/ivectors_utt.*.ark |" ark,scp:$absdir/ivectors_utt.ark,$dir/ivectors_utt.scp + rm $dir/ivectors_utt.*.ark +fi + +echo "$0: done extracting iVectors (per-speaker, per-sentence) into '$dir'" + diff --git a/egs/wsj/s5/steps/nnet/ivector/train_diag_ubm.sh b/egs/wsj/s5/steps/nnet/ivector/train_diag_ubm.sh new file mode 100755 index 00000000000..ebd36a9e8e4 --- /dev/null +++ b/egs/wsj/s5/steps/nnet/ivector/train_diag_ubm.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2013 Daniel Povey +# 2016 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0. + +# This script trains a diagonal UBM that we'll use in online iVector estimation, +# where the online-estimated iVector will be used as a secondary input to a deep +# neural net for single-pass DNN-based decoding. + +# This script was modified from ../../sre08/v1/sid/train_diag_ubm.sh. +# It trains a diagonal UBM on top of input features. We use the original features, +# assuming they are already normalized (or transformed). + +# This script does not use the trained model from the source directory to +# initialize the diagonal GMM; instead, we initialize the GMM using +# gmm-global-init-from-feats, which sets the means to random data points and +# then does some iterations of E-M in memory. After the in-memory +# initialization we train for a few iterations in parallel. +# Note that there is a slight mismatch in that the source LDA+MLLT matrix +# (final.mat) will have been estimated using standard CMVN, and we're using +# online CMVN. We don't think this will have much effect. + + +# Begin configuration section. +nj=4 +cmd=run.pl +num_iters=4 +stage=-2 +num_gselect=30 # Number of Gaussian-selection indices to use while training + # the model. +num_frames=500000 # number of frames to keep in memory for initialization +num_iters_init=20 +initial_gauss_proportion=0.5 # Start with half the target number of Gaussians +subsample=2 # subsample all features with this periodicity, in the main E-M phase. +cleanup=true +min_gaussian_weight=0.0001 +remove_low_count_gaussians=true # set this to false if you need #gauss to stay fixed. +num_threads=8 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 data/train 1024 exp/diag_ubm" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # number of parallel jobs to run." + echo " --num-iters # number of iterations of parallel " + echo " # training (default: $num_iters)" + echo " --stage # stage to do partial re-run from." 
+ echo " --num-gselect # Number of Gaussians per frame to" + echo " # limit computation to, for speed" + echo " --subsample # In main E-M phase, use every n" + echo " # frames (a speedup)" + echo " --num-frames # Maximum num-frames to keep in memory" + echo " # for model initialization" + echo " --num-iters-init # Number of E-M iterations for model" + echo " # initialization" + echo " --initial-gauss-proportion # Proportion of Gaussians to start with" + echo " # in initialization phase (then split)" + echo " --num-threads # number of threads to use in initialization" + echo " # phase (must match with parallel-opts option)" + echo " --min-gaussian-weight # min Gaussian weight allowed in GMM" + echo " # initialization (this relatively high" + echo " # value keeps counts fairly even)" + exit 1; +fi + +set -euo pipefail + +data=$1 +num_gauss=$2 +dir=$3 + +! [ $num_gauss -gt 0 ] && echo "Bad num-gauss $num_gauss" && exit 1; + +sdata=$data/split$nj +mkdir -p $dir/log +utils/split_data.sh $data $nj || exit 1; + +for f in $data/feats.scp; do + [ ! -f "$f" ] && echo "$0: expecting file $f to exist" && exit 1 +done + +# Note: there is no point subsampling all_feats, because gmm-global-init-from-feats +# effectively does subsampling itself (it keeps a random subset of the features). +all_feats="ark,s,cs:copy-feats scp:$data/feats.scp ark:- |" +feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |" + +num_gauss_init=$(perl -e "print int($initial_gauss_proportion * $num_gauss); "); +! [ $num_gauss_init -gt 0 ] && echo "Invalid num-gauss-init $num_gauss_init" && exit 1; + +if [ $stage -le -2 ]; then + echo "$0: initializing model from E-M in memory, " + echo "$0: starting from $num_gauss_init Gaussians, reaching $num_gauss;" + echo "$0: for $num_iters_init iterations, using at most $num_frames frames of data" + + $cmd --num-threads $num_threads $dir/log/gmm_init.log \ + gmm-global-init-from-feats --num-threads=$num_threads --num-frames=$num_frames \ + --min-gaussian-weight=$min_gaussian_weight \ + --num-gauss=$num_gauss --num-gauss-init=$num_gauss_init --num-iters=$num_iters_init \ + "$all_feats" $dir/0.dubm +fi + +# Store Gaussian selection indices on disk-- this speeds up the training passes. +if [ $stage -le -1 ]; then + echo "Getting Gaussian-selection info" + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + gmm-gselect --n=$num_gselect $dir/0.dubm "$feats" \ + "ark:|gzip -c >$dir/gselect.JOB.gz" +fi + +echo "$0: will train for $num_iters iterations, in parallel over" +echo "$0: $nj machines, parallelized with '$cmd'" + +for x in $(seq 0 $[$num_iters-1]); do + echo "$0: Training pass $x" + if [ $stage -le $x ]; then + # Accumulate stats. + $cmd JOB=1:$nj $dir/log/acc.${x}.JOB.log \ + gmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \ + $dir/$x.dubm "$feats" $dir/$x.JOB.acc + if [ $x -lt $[$num_iters-1] ]; then # Don't remove low-count Gaussians till last iter, + opt="--remove-low-count-gaussians=false" # or gselect info won't be valid any more. + else + opt="--remove-low-count-gaussians=$remove_low_count_gaussians" + fi + $cmd $dir/log/update.${x}.log \ + gmm-global-est $opt --min-gaussian-weight=$min_gaussian_weight $dir/${x}.dubm "gmm-global-sum-accs - $dir/${x}.*.acc|" \ + $dir/$[$x+1].dubm + rm $dir/$x.*.acc $dir/$x.dubm + fi +done + +rm $dir/gselect.*.gz +mv $dir/$num_iters.dubm $dir/final.dubm + +exit 0 # Done! 
+ diff --git a/egs/wsj/s5/steps/nnet/ivector/train_ivector_extractor.sh b/egs/wsj/s5/steps/nnet/ivector/train_ivector_extractor.sh new file mode 100755 index 00000000000..252035a525f --- /dev/null +++ b/egs/wsj/s5/steps/nnet/ivector/train_ivector_extractor.sh @@ -0,0 +1,179 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# 2016 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0. + +# This script is modified from ^/egs/sre08/v1/sid/train_ivector_extractor.sh. +# It trains an iVector extractor for use in DNN training. + +# This script trains the i-vector extractor. Note: there are 3 separate levels +# of parallelization: num_threads, num_processes, and num_jobs. This may seem a +# bit excessive. It has to do with minimizing memory usage and disk I/O, +# subject to various constraints. The "num_threads" is how many threads a +# program uses; the "num_processes" is the number of separate processes a single +# job spawns, and then sums the accumulators in memory. Our recommendation: +# - Set num_threads to the minimum of (4, or how many virtual cores your machine has). +# (because of needing to lock various global quantities, the program can't +# use many more than 4 threads with good CPU utilization). +# - Set num_processes to the number of virtual cores on each machine you have, divided by +# num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue +# that's busy with other people's jobs, it may be wise to set it to rather less +# than this maximum though, or your jobs won't get scheduled. And if memory is +# tight you need to be careful; in our normal setup, each process uses about 5G. +# - Set num_jobs to as many of the jobs (each using $num_threads * $num_processes CPUs) +# your queue will let you run at one time, but don't go much more than 10 or 20, or +# summing the accumulators will possibly get slow. If you have a lot of data, you +# may want more jobs, though. + +# Begin configuration section. +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we + # run is nj * num_processes * num_threads, and the number of + # separate pieces of data is nj * num_processes. +num_threads=4 +num_processes=2 # each job runs this many processes, each with --num-threads threads +cmd="run.pl" +stage=-4 +ivector_dim=100 # dimension of the extracted i-vector +num_iters=10 +num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select +posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for + # inter-frame correlations. +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) + # caution: you should use the same value in the online-estimation + # code. +subsample=2 # This speeds up the training: training on every 2nd feature + # (configurable) Since the features are highly correlated across + # frames, we don't expect to lose too much from this. +parallel_opts= # ignored now. +cleanup=true +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 data/train exp/nnet2_online/diag_ubm/ exp/nnet2_online/extractor" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ echo " --num-iters <#iters|10> # Number of iterations of E-M" + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --num-processes # Number of processes for each queue job (relates" + echo " # to summing accs in memory)" + echo " --num-threads # Number of threads for each process (can't be usefully" + echo " # increased much above 4)" + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + exit 1; +fi + +set -euxo pipefail + +data=$1 +srcdir=$2 +dir=$3 + +for f in $srcdir/final.dubm $data/feats.scp; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +nj_full=$[$nj*$num_processes] +sdata=$data/split$nj_full; +utils/split_data.sh $data $nj_full + +cp $srcdir/final.dubm $dir + +## Set up features. +gmm_feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |" +feats="$gmm_feats" + +# Initialize the i-vector extractor using the input GMM, which is converted to +# full because that's what the i-vector extractor expects. Note: we have to do +# --use-weights=false to disable regression of the log weights on the ivector, +# because that would make the online estimation of the ivector difficult (since +# the online/real-time ivector estimation is the whole point of this script). +if [ $stage -le -2 ]; then + $cmd $dir/log/init.log \ + ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=false \ + "gmm-global-to-fgmm $dir/final.dubm -|" $dir/0.ie +fi + +# Do Gaussian selection and posterior extracion + +# if we subsample frame, modify the posterior-scale; this is likely +# to make the original posterior-scale (before subsampling) suitable. +modified_posterior_scale=$(perl -e "print $posterior_scale * $subsample;"); + +if [ $stage -le -1 ]; then + echo $nj_full > $dir/num_jobs + echo "$0: doing Gaussian selection and posterior computation" + $cmd JOB=1:$nj_full $dir/log/post.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $dir/final.dubm "$gmm_feats" ark:- \| \ + scale-post ark:- $modified_posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" +else + # make sure we at least have the right number of post.*.gz files. + if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then + echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)" + exit 1 + fi +fi + +x=0 +while [ $x -lt $num_iters ]; do + if [ $stage -le $x ]; then + rm $dir/.error 2>/dev/null || true + + Args=() # bash array of training commands for 1:nj, that put accs to stdout. 
+ for j in $(seq $nj_full); do + Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g` + done + + echo "Accumulating stats (pass $x)" + for g in $(seq $nj); do + start=$[$num_processes*($g-1)+1] + $cmd --num-threads $[$num_threads*$num_processes] $dir/log/acc.$x.$g.log \ + ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \ + $dir/acc.$x.$g || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; + + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x + + echo "Updating model (pass $x)" + nt=$[$num_threads*$num_processes] # use the same number of threads that + # each accumulation process uses, since we + # can be sure the queue will support this many. + # + # The parallel-opts was either specified by + # the user or we computed it correctly in + # tge previous stages + $cmd --num-threads $[$num_threads*$num_processes] $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie + rm $dir/acc.$x.* + + if $cleanup; then + rm $dir/acc.$x + # rm $dir/$x.ie + fi + fi + x=$[$x+1] +done + +rm $dir/final.ie 2>/dev/null || true +ln -s $x.ie $dir/final.ie diff --git a/egs/wsj/s5/steps/nnet/train_mpe.sh b/egs/wsj/s5/steps/nnet/train_mpe.sh index d6203a7da60..1d2a6256ea8 100755 --- a/egs/wsj/s5/steps/nnet/train_mpe.sh +++ b/egs/wsj/s5/steps/nnet/train_mpe.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely) +# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0. # Sequence-discriminative MPE/sMBR training of DNN. @@ -136,7 +136,7 @@ feats="ark,o:copy-feats scp:$dir/train.scp ark:- |" # add-ivector (optional), if [ -e $D/ivector_dim ]; then [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 - # Get the tool, + # Get the tool, ivector_append_tool=append-vector-to-feats # default, [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) # Check dims, @@ -160,7 +160,7 @@ fi ### ### Prepare the alignments -### +### # Assuming all alignments will fit into memory ali="ark:gunzip -c $alidir/ali.*.gz |" @@ -202,7 +202,7 @@ while [ $x -le $num_iters ]; do x=$((x+1)) learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") - + done (cd $dir; [ -e final.nnet ] && unlink final.nnet; ln -s $((x-1)).nnet final.nnet) diff --git a/egs/wsj/s5/steps/nnet/train_scheduler.sh b/egs/wsj/s5/steps/nnet/train_scheduler.sh index dbd9f7123eb..e2499b17274 100755 --- a/egs/wsj/s5/steps/nnet/train_scheduler.sh +++ b/egs/wsj/s5/steps/nnet/train_scheduler.sh @@ -22,7 +22,7 @@ feature_transform= max_iters=20 min_iters=0 # keep training, disable weight rejection, start learn-rate halving as usual, keep_lr_iters=0 # fix learning rate for N initial epochs, disable weight rejection, -dropout_iters= # Disable dropout after 'N' initial epochs, +dropout_schedule= # dropout-rates for N initial epochs, for example: 0.1,0.1,0.1,0.1,0.1,0.0 start_halving_impr=0.01 end_halving_impr=0.001 halving_factor=0.5 @@ -31,11 +31,11 @@ halving_factor=0.5 verbose=1 frame_weights= utt_weights= - + # End configuration. echo "$0 $@" # Print the command line for logging -[ -f path.sh ] && . ./path.sh; +[ -f path.sh ] && . ./path.sh; . 
parse_options.sh || exit 1; @@ -60,6 +60,8 @@ dir=$6 [ ! -d $dir/log ] && mkdir $dir/log [ ! -d $dir/nnet ] && mkdir $dir/nnet +dropout_array=($(echo ${dropout_schedule} | tr ',' ' ')) + # Skip training [ -e $dir/final.nnet ] && echo "'$dir/final.nnet' exists, skipping training" && exit 0 @@ -99,10 +101,11 @@ for iter in $(seq -w $max_iters); do # skip iteration (epoch) if already done, [ -e $dir/.done_iter$iter ] && echo -n "skipping... " && ls $mlp_next* && continue - # disable dropout? - if [ -n $dropout_iters -a $((dropout_iters+1)) -eq $iter ]; then - nnet-copy --dropout-retention=1.0 $mlp_best ${mlp_best}.no_dropout - mlp_best=${mlp_best}.no_dropout + # set dropout-rate from the schedule, + if [ -n ${dropout_array[$((${iter#0}-1))]-''} ]; then + dropout_rate=${dropout_array[$((${iter#0}-1))]} + nnet-copy --dropout-rate=$dropout_rate $mlp_best ${mlp_best}.dropout_rate${dropout_rate} + mlp_best=${mlp_best}.dropout_rate${dropout_rate} fi # training, @@ -114,11 +117,11 @@ for iter in $(seq -w $max_iters); do ${frame_weights:+ "--frame-weights=$frame_weights"} \ ${utt_weights:+ "--utt-weights=$utt_weights"} \ "$feats_tr" "$labels_tr" $mlp_best $mlp_next \ - 2>> $log || exit 1; + 2>> $log || exit 1; tr_loss=$(cat $dir/log/iter${iter}.tr.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') echo -n "TRAIN AVG.LOSS $(printf "%.4f" $tr_loss), (lrate$(printf "%.6g" $learn_rate)), " - + # cross-validation, log=$dir/log/iter${iter}.cv.log; hostname>$log $train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \ @@ -127,7 +130,7 @@ for iter in $(seq -w $max_iters); do ${utt_weights:+ "--utt-weights=$utt_weights"} \ "$feats_cv" "$labels_cv" $mlp_next \ 2>>$log || exit 1; - + loss_new=$(cat $dir/log/iter${iter}.cv.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') echo -n "CROSSVAL AVG.LOSS $(printf "%.4f" $loss_new), " @@ -141,7 +144,7 @@ for iter in $(seq -w $max_iters); do [ $iter -le $keep_lr_iters ] && mlp_best=${mlp_best}_keep-lr-iters-$keep_lr_iters mv $mlp_next $mlp_best echo "nnet accepted ($(basename $mlp_best))" - echo $mlp_best > $dir/.mlp_best + echo $mlp_best > $dir/.mlp_best else # rejecting, mlp_reject=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)_rejected @@ -151,9 +154,9 @@ for iter in $(seq -w $max_iters); do # create .done file, the iteration (epoch) is completed, touch $dir/.done_iter$iter - + # continue with original learn-rate, - [ $iter -le $keep_lr_iters ] && continue + [ $iter -le $keep_lr_iters ] && continue # stopping criterion, rel_impr=$(bc <<< "scale=10; ($loss_prev-$loss)/$loss_prev") @@ -171,7 +174,7 @@ for iter in $(seq -w $max_iters); do halving=1 echo $halving >$dir/.halving fi - + # reduce the learning-rate, if [ 1 == $halving ]; then learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") @@ -180,7 +183,7 @@ for iter in $(seq -w $max_iters); do done # select the best network, -if [ $mlp_best != $mlp_init ]; then +if [ $mlp_best != $mlp_init ]; then mlp_final=${mlp_best}_final_ ( cd $dir/nnet; ln -s $(basename $mlp_best) $(basename $mlp_final); ) ( cd $dir; ln -s nnet/$(basename $mlp_final) final.nnet; ) diff --git a/egs/wsj/s5/utils/nnet/make_nnet_proto.py b/egs/wsj/s5/utils/nnet/make_nnet_proto.py index 873c5107822..7b5c50beeb8 100755 --- a/egs/wsj/s5/utils/nnet/make_nnet_proto.py +++ b/egs/wsj/s5/utils/nnet/make_nnet_proto.py @@ -72,10 +72,12 @@ parser.add_option('--bottleneck-dim', dest='bottleneck_dim', help='Make bottleneck network with desired 
bn-dim (0 = no bottleneck) [default: %default]', default=0, type='int'); -parser.add_option('--dropout-retention', dest='dropout_retention', - help='Put dropout after the non-linearity of hidden layerm (0.0 = disabled) [default: %default]', - default=0.0, type='float'); - +parser.add_option('--with-dropout', dest='with_dropout', + help='Add after the non-linearity of hidden layer.', + action='store_true', default=False); +parser.add_option('--dropout-opts', dest='dropout_opts', + help='Extra options for dropout [default: %default]', + default='', type='string'); (o,args) = parser.parse_args() @@ -86,6 +88,7 @@ # A HACK TO PASS MULTI-WORD OPTIONS, WORDS ARE CONNECTED BY UNDERSCORES '_', o.activation_opts = o.activation_opts.replace("_"," ") o.affine_opts = o.affine_opts.replace("_"," ") +o.dropout_opts = o.dropout_opts.replace("_"," ") (feat_dim, num_leaves, num_hid_layers, num_hid_neurons) = map(int,args); ### End parse options @@ -179,8 +182,8 @@ def Glorot(dim1, dim2): # This is done by multiplying with stddev(U[0,1]) = sqrt(1/12). # The stddev of weights is consequently reduced with scale 0.29, print "%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts) -if o.dropout_retention > 0.0: - print " %d %d %f" % (num_hid_neurons, num_hid_neurons, o.dropout_retention) +if o.with_dropout: + print " %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts) # Internal AffineTransforms, @@ -189,8 +192,8 @@ def Glorot(dim1, dim2): (num_hid_neurons, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \ (o.param_stddev_factor * Glorot(num_hid_neurons, num_hid_neurons)), o.max_norm, o.affine_opts) print "%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts) - if o.dropout_retention > 0.0: - print " %d %d %f" % (num_hid_neurons, num_hid_neurons, o.dropout_retention) + if o.with_dropout: + print " %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts) # Optionaly add bottleneck, if o.bottleneck_dim != 0: @@ -213,8 +216,8 @@ def Glorot(dim1, dim2): (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \ (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)), o.max_norm, o.affine_opts) print "%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts) - if o.dropout_retention > 0.0: - print " %d %d %f" % (num_hid_neurons, num_hid_neurons, o.dropout_retention) + if o.with_dropout: + print " %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts) # Last AffineTransform (10x smaller learning rate on bias) print " %d %d %f %f %f %f %f" % \ diff --git a/src/ivectorbin/ivector-normalize-length.cc b/src/ivectorbin/ivector-normalize-length.cc index d27a92bd01b..b1a7d665e2a 100644 --- a/src/ivectorbin/ivector-normalize-length.cc +++ b/src/ivectorbin/ivector-normalize-length.cc @@ -36,15 +36,19 @@ int main(int argc, char *argv[]) { "\n" "e.g.: \n" " ivector-normalize-length ark:ivectors.ark ark:normalized_ivectors.ark\n"; - + ParseOptions po(usage); bool normalize = true; po.Register("normalize", &normalize, "Set this to false to disable normalization"); - + + bool scaleup = true; + po.Register("scaleup", &scaleup, + "If 'true', the normalized iVector is scaled-up by 'sqrt(dim)'"); + po.Read(argc, argv); - + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); @@ -55,13 +59,13 @@ int main(int argc, char *argv[]) { int32 num_done = 0; - + double tot_ratio = 0.0, tot_ratio2 = 0.0; SequentialBaseFloatVectorReader ivector_reader(ivector_rspecifier); BaseFloatVectorWriter 
ivector_writer(ivector_wspecifier); - + for (; !ivector_reader.Done(); ivector_reader.Next()) { std::string key = ivector_reader.Key(); Vector ivector = ivector_reader.Value(); @@ -69,7 +73,8 @@ int main(int argc, char *argv[]) { BaseFloat ratio = norm / sqrt(ivector.Dim()); // how much larger it is // than it would be, in // expectation, if normally - // distributed. + if (!scaleup) ratio = norm; + KALDI_VLOG(2) << "Ratio for key " << key << " is " << ratio; if (ratio == 0.0) { KALDI_WARN << "Zero iVector"; @@ -88,7 +93,7 @@ int main(int argc, char *argv[]) { ratio_stddev = sqrt(tot_ratio2 / num_done - avg_ratio * avg_ratio); KALDI_LOG << "Average ratio of iVector to expected length was " << avg_ratio << ", standard deviation was " << ratio_stddev; - } + } return (num_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/nnet/nnet-activation.h b/src/nnet/nnet-activation.h index bf8a0ee5afc..74b0ebad650 100644 --- a/src/nnet/nnet-activation.h +++ b/src/nnet/nnet-activation.h @@ -1,6 +1,6 @@ // nnet/nnet-activation.h -// Copyright 2011-2013 Brno University of Technology (author: Karel Vesely) +// Copyright 2011-2016 Brno University of Technology (author: Karel Vesely) // See ../../COPYING for clarification regarding multiple authors // @@ -23,6 +23,7 @@ #include #include +#include #include "nnet/nnet-component.h" #include "nnet/nnet-utils.h" @@ -269,7 +270,7 @@ class Dropout : public Component { public: Dropout(int32 dim_in, int32 dim_out): Component(dim_in, dim_out), - dropout_retention_(0.5) + dropout_rate_(0.5) { } ~Dropout() @@ -284,36 +285,58 @@ class Dropout : public Component { std::string token; while (is >> std::ws, !is.eof()) { ReadToken(is, false, &token); - /**/ if (token == "") ReadBasicType(is, false, &dropout_retention_); + /**/ if (token == "") ReadBasicType(is, false, &dropout_rate_); else KALDI_ERR << "Unknown token " << token << ", a typo in config?" - << " (DropoutRetention)"; + << " (DropoutRate)"; } - KALDI_ASSERT(dropout_retention_ > 0.0 && dropout_retention_ <= 1.0); + KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0); } void ReadData(std::istream &is, bool binary) { - if ('<' == Peek(is, binary)) { - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &dropout_retention_); + // Read all the '' in arbitrary order, + bool finished = false; + while ('<' == Peek(is, binary) && !finished) { + std::string token; + int first_char = PeekToken(is, binary); + switch (first_char) { + case 'D': ReadToken(is, false, &token); + /**/ if (token == "") ReadBasicType(is, binary, &dropout_rate_); + else if (token == "") { /* compatibility */ + BaseFloat dropout_retention; + ReadBasicType(is, binary, &dropout_retention); + dropout_rate_ = 1.0 - dropout_retention; + } else KALDI_ERR << "Unknown token: " << token; + break; + case '!': ExpectToken(is, binary, ""); + finished = true; + break; + default: ReadToken(is, false, &token); + KALDI_ERR << "Unknown token: " << token; + } } - KALDI_ASSERT(dropout_retention_ > 0.0 && dropout_retention_ <= 1.0); + KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0); } void WriteData(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, dropout_retention_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dropout_rate_); + } + + std::string Info() const { + return std::string(" ") + ToString(dropout_rate_); } void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { out->CopyFromMat(in); - // switch off 50% of the inputs... 
+ // set N inputs to zero, according to the 'dropout_rate_' ... dropout_mask_.Resize(out->NumRows(), out->NumCols()); - dropout_mask_.Set(dropout_retention_); - rand_.BinarizeProbs(dropout_mask_, &dropout_mask_); + rand_.RandUniform(&dropout_mask_); // [0..1] + dropout_mask_.Add(-dropout_rate_); // [(-rate)..(1-rate)] + dropout_mask_.Heaviside(dropout_mask_); // (x > 0.0 ? 1 : 0) out->MulElements(dropout_mask_); - // rescale to keep same dynamic range as w/o dropout - out->Scale(1.0/dropout_retention_); + // rescale to keep the same dynamic range as w/o dropout, + out->Scale(1.0 / (1.0 - dropout_rate_)); } void BackpropagateFnc(const CuMatrixBase &in, @@ -323,21 +346,24 @@ class Dropout : public Component { in_diff->CopyFromMat(out_diff); // use same mask on the error derivatives... in_diff->MulElements(dropout_mask_); - // enlarge output to fit dynamic range w/o dropout - in_diff->Scale(1.0/dropout_retention_); + // enlarge the output to fit same dynamic range as w/o dropout + in_diff->Scale(1.0 / (1.0 - dropout_rate_)); } - BaseFloat GetDropoutRetention() { return dropout_retention_; } + BaseFloat GetDropoutRate() { return dropout_rate_; } - void SetDropoutRetention(BaseFloat dr) { - dropout_retention_ = dr; - KALDI_ASSERT(dropout_retention_ > 0.0 && dropout_retention_ <= 1.0); + void SetDropoutRate(BaseFloat dr) { + dropout_rate_ = dr; + KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0); } private: - CuRand rand_; - CuMatrix dropout_mask_; - BaseFloat dropout_retention_; + BaseFloat dropout_rate_; ///< probability that a neuron is dropped, + + CuRand rand_; ///< generator of random numbers, + + CuMatrix dropout_mask_; // random binary mask, + // 1 = keep neuron, 0 = drop neuron, }; } // namespace nnet1 diff --git a/src/nnet/nnet-nnet.cc b/src/nnet/nnet-nnet.cc index 4b0b565b94c..86c5f9e5ad0 100644 --- a/src/nnet/nnet-nnet.cc +++ b/src/nnet/nnet-nnet.cc @@ -265,13 +265,13 @@ void Nnet::SetParams(const VectorBase& params) { KALDI_ASSERT(pos == NumParams()); } -void Nnet::SetDropoutRetention(BaseFloat r) { +void Nnet::SetDropoutRate(BaseFloat r) { for (int32 c = 0; c < NumComponents(); c++) { if (GetComponent(c).GetType() == Component::kDropout) { Dropout& comp = dynamic_cast(GetComponent(c)); - BaseFloat r_old = comp.GetDropoutRetention(); - comp.SetDropoutRetention(r); - KALDI_LOG << "Setting dropout-retention in component " << c + BaseFloat r_old = comp.GetDropoutRate(); + comp.SetDropoutRate(r); + KALDI_LOG << "Setting dropout-rate in component " << c << " from " << r_old << " to " << r; } } diff --git a/src/nnet/nnet-nnet.h b/src/nnet/nnet-nnet.h index a06cf072f5f..cf29f91a89d 100644 --- a/src/nnet/nnet-nnet.h +++ b/src/nnet/nnet-nnet.h @@ -123,7 +123,7 @@ class Nnet { void SetParams(const VectorBase& params); /// Set the dropout rate - void SetDropoutRetention(BaseFloat r); + void SetDropoutRate(BaseFloat r); /// Reset streams in multi-stream training, void ResetStreams(const std::vector &stream_reset_flag); diff --git a/src/nnetbin/nnet-copy.cc b/src/nnetbin/nnet-copy.cc index 2567001beb3..c4a27f2dd69 100644 --- a/src/nnetbin/nnet-copy.cc +++ b/src/nnetbin/nnet-copy.cc @@ -37,7 +37,7 @@ int main(int argc, char *argv[]) { bool binary_write = true; int32 remove_first_components = 0; int32 remove_last_components = 0; - BaseFloat dropout_retention = 0.0; + BaseFloat dropout_rate = -1.0; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); @@ -52,8 +52,9 @@ int main(int argc, char *argv[]) { po.Register("remove-last-components", 
&remove_last_components, "Remove N last layers Components from the Nnet"); - po.Register("dropout-retention", &dropout_retention, - "Set dropout retention to a particular value."); + po.Register("dropout-rate", &dropout_rate, + "Probability that neuron is dropped" + "(-1.0 keeps original value)."); std::string from_parallel_component; po.Register("from-parallel-component", &from_parallel_component, @@ -131,8 +132,8 @@ int main(int argc, char *argv[]) { } // dropout, - if (dropout_retention != 0.0) { - nnet.SetDropoutRetention(dropout_retention); + if (dropout_rate != -1.0) { + nnet.SetDropoutRate(dropout_rate); } // store the network, diff --git a/src/nnetbin/nnet-forward.cc b/src/nnetbin/nnet-forward.cc index 1a40d03cdf7..062bca7da9d 100644 --- a/src/nnetbin/nnet-forward.cc +++ b/src/nnetbin/nnet-forward.cc @@ -109,8 +109,8 @@ int main(int argc, char *argv[]) { PdfPrior pdf_prior(prior_opts); // disable dropout, - nnet_transf.SetDropoutRetention(1.0); - nnet.SetDropoutRetention(1.0); + nnet_transf.SetDropoutRate(0.0); + nnet.SetDropoutRate(0.0); kaldi::int64 tot_t = 0; diff --git a/src/nnetbin/nnet-train-frmshuff.cc b/src/nnetbin/nnet-train-frmshuff.cc index 58e50074492..07cfb626d9f 100644 --- a/src/nnetbin/nnet-train-frmshuff.cc +++ b/src/nnetbin/nnet-train-frmshuff.cc @@ -82,11 +82,6 @@ int main(int argc, char *argv[]) { po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA"); - double dropout_retention = 0.0; - po.Register("dropout-retention", &dropout_retention, - "number between 0..1, controls how many neurons are preserved " - "(0.0 will keep the value unchanged)"); - po.Read(argc, argv); if (po.NumArgs() != 3 + (crossvalidate ? 0 : 1)) { @@ -120,13 +115,9 @@ int main(int argc, char *argv[]) { nnet.Read(model_filename); nnet.SetTrainOptions(trn_opts); - if (dropout_retention > 0.0) { - nnet_transf.SetDropoutRetention(dropout_retention); - nnet.SetDropoutRetention(dropout_retention); - } if (crossvalidate) { - nnet_transf.SetDropoutRetention(1.0); - nnet.SetDropoutRetention(1.0); + nnet_transf.SetDropoutRate(0.0); + nnet.SetDropoutRate(0.0); } kaldi::int64 total_frames = 0; diff --git a/src/nnetbin/nnet-train-multistream-perutt.cc b/src/nnetbin/nnet-train-multistream-perutt.cc index 53a98b9b03f..154c7fd9c9d 100644 --- a/src/nnetbin/nnet-train-multistream-perutt.cc +++ b/src/nnetbin/nnet-train-multistream-perutt.cc @@ -118,6 +118,11 @@ int main(int argc, char *argv[]) { nnet.Read(model_filename); nnet.SetTrainOptions(trn_opts); + if (crossvalidate) { + nnet_transf.SetDropoutRate(0.0); + nnet.SetDropoutRate(0.0); + } + kaldi::int64 total_frames = 0; // Initialize feature and target readers, diff --git a/src/nnetbin/nnet-train-multistream.cc b/src/nnetbin/nnet-train-multistream.cc index bdc2d132d04..7424759f45b 100644 --- a/src/nnetbin/nnet-train-multistream.cc +++ b/src/nnetbin/nnet-train-multistream.cc @@ -196,6 +196,11 @@ int main(int argc, char *argv[]) { nnet.Read(model_filename); nnet.SetTrainOptions(trn_opts); + if (crossvalidate) { + nnet_transf.SetDropoutRate(0.0); + nnet.SetDropoutRate(0.0); + } + kaldi::int64 total_frames = 0; SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); diff --git a/src/nnetbin/nnet-train-perutt.cc b/src/nnetbin/nnet-train-perutt.cc index 04ee0b97bab..c8695ffa4ff 100644 --- a/src/nnetbin/nnet-train-perutt.cc +++ b/src/nnetbin/nnet-train-perutt.cc @@ -114,6 +114,11 @@ int main(int argc, char *argv[]) { nnet.Read(model_filename); nnet.SetTrainOptions(trn_opts); + if (crossvalidate) { + 
nnet_transf.SetDropoutRate(0.0); + nnet.SetDropoutRate(0.0); + } + kaldi::int64 total_frames = 0; SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
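# Usage sketch for the new --dropout-rate option of nnet-copy introduced above
# (the model path is an assumed example from the 'relu + dropout' recipe):
# setting the rate to 0.0 switches dropout off in a stored network, mirroring
# what nnet-forward and the cross-validation branches of the training tools now
# do internally via SetDropoutRate(0.0); the default -1.0 keeps the stored rate.
#   nnet-copy --dropout-rate=0.0 exp/ihm/dnn4d-6L1024-relu/final.nnet final_nodrop.nnet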