diff --git a/egs/ami/s5/RESULTS_ihm b/egs/ami/s5/RESULTS_ihm index 6435e9df47b..667c6362da3 100644 --- a/egs/ami/s5/RESULTS_ihm +++ b/egs/ami/s5/RESULTS_ihm @@ -23,30 +23,46 @@ exp/ihm/tri4a_mmi_b0.1/decode_eval_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4. # Karel, JSALT 2015, (21.7.2015) -# dev, -## GMM, +# GMM - dev, %WER 38.1 | 13098 94489 | 67.1 20.6 12.2 5.2 38.1 67.0 | exp/ihm/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.sys %WER 35.5 | 13098 94487 | 69.6 19.0 11.4 5.1 35.5 65.8 | exp/ihm/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.sys %WER 32.2 | 13098 94483 | 72.5 17.2 10.3 4.8 32.2 63.8 | exp/ihm/tri4a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_14/dev.ctm.filt.sys #0.1% worse than Pawel! %WER 30.2 | 13098 94479 | 74.0 15.6 10.4 4.2 30.2 61.9 | exp/ihm/tri4a_mmi_b0.1/decode_dev_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/dev.ctm.filt.sys -## DNN-Xent, -%WER 26.0 | 13098 94483 | 77.9 13.5 8.5 4.0 26.0 58.4 | exp/ihm/dnn4_pretrain-dbn_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_11/dev.ctm.filt.sys -## DNN-sMBR, -%WER 24.9 | 13098 94484 | 79.2 13.2 7.6 4.1 24.9 57.1 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/dev.ctm.filt.sys -%WER 24.3 | 13098 94481 | 79.6 12.6 7.8 3.9 24.3 56.3 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_12/dev.ctm.filt.sys -# eval, -## GMM, +# GMM - eval, %WER 43.9 | 12643 89978 | 60.8 25.3 13.9 4.8 43.9 65.6 | exp/ihm/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys %WER 40.8 | 12643 89985 | 63.8 23.6 12.6 4.7 40.8 64.6 | exp/ihm/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys %WER 35.1 | 12643 89975 | 69.1 19.8 11.1 4.2 35.1 61.8 | exp/ihm/tri4a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys #0.1% worse than Pawel! 
%WER 31.7 | 12643 89986 | 72.1 18.0 9.9 3.8 31.7 59.4 | exp/ihm/tri4a_mmi_b0.1/decode_eval_4.mdl_ami_fsh.o3g.kn.pr1-7/ascore_11/eval.ctm.filt.sys + +# nnet1 DNN on 'fmllr' system, RBMs 6x2048 neurons (21.7.2015): ## DNN-Xent, +%WER 26.0 | 13098 94483 | 77.9 13.5 8.5 4.0 26.0 58.4 | exp/ihm/dnn4_pretrain-dbn_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_11/dev.ctm.filt.sys %WER 27.1 | 12643 89971 | 76.4 15.5 8.1 3.5 27.1 57.2 | exp/ihm/dnn4_pretrain-dbn_dnn/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_10/eval.ctm.filt.sys ## DNN-sMBR, -%WER 25.4 | 12643 89974 | 77.9 14.7 7.4 3.3 25.4 55.1 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/eval.ctm.filt.sys +%WER 24.3 | 13098 94481 | 79.6 12.6 7.8 3.9 24.3 56.3 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_12/dev.ctm.filt.sys %WER 24.6 | 12643 89972 | 78.8 14.1 7.1 3.3 24.6 54.4 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_11/eval.ctm.filt.sys +# nnet1 'fmllr + relu + dropout' system (27.9.2016): +## DNN-Xent, +%WER 26.3 | 13098 94489 | 77.6 13.6 8.7 3.9 26.3 59.2 | 0.005 | exp/ihm/dnn4d-6L1024-relu/decode_dev/ascore_11/dev.ctm.filt.sys +%WER 27.2 | 12643 89970 | 75.9 15.0 9.1 3.1 27.2 57.4 | 0.053 | exp/ihm/dnn4d-6L1024-relu/decode_eval/ascore_11/eval.ctm.filt.sys +# => about the same as the larger RBM system, +## DNN-sMBR, +%WER 25.1 | 13098 94474 | 78.6 12.7 8.6 3.7 25.1 57.6 | -0.193 | exp/ihm/dnn4d-6L1024-relu_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_15/dev.ctm.filt.sys +%WER 25.2 | 12643 89977 | 77.7 14.1 8.2 2.9 25.2 55.1 | -0.138 | exp/ihm/dnn4d-6L1024-relu_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_15/eval.ctm.filt.sys +# => on sMBR worse than the larger RBM system (tuning learning rate did not help), + +# nnet1 DNN on 'fmllr + kaldi i-vector per-spk' system (3.10.2016), RBMs 6x2048 neurons: +## DNN-Xent, +%WER 24.7 | 13098 94475 | 79.2 12.7 8.2 3.9 24.7 58.1 | -0.018 | exp/ihm/dnn4_pretrain-dbn-ivec_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_11/dev.ctm.filt.sys +%WER 25.2 | 12643 89972 | 78.0 13.8 8.2 3.2 25.2 56.4 | 0.057 | exp/ihm/dnn4_pretrain-dbn-ivec_dnn/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_11/eval.ctm.filt.sys +## DNN-sMBR, +%WER 23.2 | 13098 94477 | 80.3 11.5 8.2 3.6 23.2 56.3 | 0.010 | exp/ihm/dnn4_pretrain-dbn-ivec_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_13/dev.ctm.filt.sys +%WER 22.8 | 12643 89982 | 80.2 12.8 7.1 3.0 22.8 53.8 | 0.060 | exp/ihm/dnn4_pretrain-dbn-ivec_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_12/eval.ctm.filt.sys +# => The kaldi i-vectors on fMLLR feats are helping nicely, +# (WER not too far from the chain systems, even w/o data augmentation, but with fMLLR), + # Vijay, TDNN results, for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/ascore_*/*.sys | utils/best_wer.sh; done diff --git a/egs/ami/s5/local/nnet/prepare_ivectors.sh b/egs/ami/s5/local/nnet/prepare_ivectors.sh new file mode 100755 index 00000000000..0bee63ec308 --- /dev/null +++ b/egs/ami/s5/local/nnet/prepare_ivectors.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Copyright 2016, Brno University of Technology (Author: Karel Vesely) +# Apache 2.0 + +. path.sh +. cmd.sh + +train=data_ihm-fmllr-tri4/ihm/train +dev=data_ihm-fmllr-tri4/ihm/dev +eval=data_ihm-fmllr-tri4/ihm/eval +gmm=exp/ihm/tri4a + +# Output directory for models and i-vectors, +ivec_absdir=$(readlink -m data_ihm-fmllr-tri4/ihm/ivector) + +. 
utils/parse_options.sh + +set -euxo pipefail + +# UBM training (1024 components), +ubm=$ivec_absdir/ubm +steps/nnet/ivector/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 \ + $train 1024 $ubm + +# Training i-vector extractor (100 dims), +iextractor=$ivec_absdir/iextractor +steps/nnet/ivector/train_ivector_extractor.sh --cmd "$train_cmd --mem 5G" --nj 10 \ + --ivector-dim 100 $train $ubm $iextractor + +# Extracting the i-vectors (per speaker, as the per-utterance copies), +steps/nnet/ivector/extract_ivectors.sh --cmd "$train_cmd" --nj 80 \ + $train data/lang $iextractor \ + ${gmm}_ali $ivec_absdir/ivec_train +steps/nnet/ivector/extract_ivectors.sh --cmd "$train_cmd" --nj 20 \ + $dev data/lang $iextractor \ + $gmm/decode_dev_ami_fsh.o3g.kn.pr1-7 $ivec_absdir/ivec_dev +steps/nnet/ivector/extract_ivectors.sh --cmd "$train_cmd" --nj 20 \ + $eval data/lang $iextractor \ + $gmm/decode_eval_ami_fsh.o3g.kn.pr1-7 $ivec_absdir/ivec_eval + + +# POST-PROCESS PER-SPEAKER I-VECTORS: + +# Get the global mean of the i-vectors (train), +ivector-mean scp:$ivec_absdir/ivec_train/ivectors_spk.scp $iextractor/global_mean + +# Merge the sets, normalize means, apply length normalization, +cat $ivec_absdir/ivec_train/ivectors_spk-as-utt.scp \ + $ivec_absdir/ivec_dev/ivectors_spk-as-utt.scp \ + $ivec_absdir/ivec_eval/ivectors_spk-as-utt.scp | \ + ivector-subtract-global-mean $iextractor/global_mean scp:- ark:- | \ + ivector-normalize-length --scaleup=false ark:- ark,scp:$ivec_absdir/ivectors_spk-as-utt_normalized.ark,$ivec_absdir/ivectors_spk-as-utt_normalized.scp + + +# POST-PROCESS PER-SENTENCE I-VECTORS: + +# Get the global mean of the i-vectors (train, per-sentence), +ivector-mean scp:$ivec_absdir/ivec_train/ivectors_utt.scp $iextractor/global_mean_utt + +# Merge the sets, normalize means, apply length normalization, +cat $ivec_absdir/ivec_train/ivectors_utt.scp \ + $ivec_absdir/ivec_dev/ivectors_utt.scp \ + $ivec_absdir/ivec_eval/ivectors_utt.scp | \ + ivector-subtract-global-mean $iextractor/global_mean_utt scp:- ark:- | \ + ivector-normalize-length --scaleup=false ark:- ark,scp:$ivec_absdir/ivectors_utt_normalized.ark,$ivec_absdir/ivectors_utt_normalized.scp + + +exit 0 # Done! diff --git a/egs/ami/s5/local/nnet/run_dnn_ivec.sh b/egs/ami/s5/local/nnet/run_dnn_ivec.sh new file mode 100755 index 00000000000..fb2531f6473 --- /dev/null +++ b/egs/ami/s5/local/nnet/run_dnn_ivec.sh @@ -0,0 +1,121 @@ +#!/bin/bash -u + +. ./cmd.sh +. ./path.sh + +# DNN training. This script is based on egs/swbd/s5b/local/run_dnn.sh +# Shinji Watanabe, Karel Vesely, + +# Config: +nj=80 +nj_decode=30 +stage=0 # resume training with --stage=N +. 
utils/parse_options.sh || exit 1; +# + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s [opts] \n\n" `basename $0` + exit 1; +fi +mic=$1 + +gmmdir=exp/$mic/tri4a +data_fmllr=data_${mic}-fmllr-tri4 + +final_lm=`cat data/local/lm/final_lm` +LM=$final_lm.pr1-7 +graph_dir=$gmmdir/graph_${LM} + +set -euxo pipefail + +# Store fMLLR features, so we can train on them easily, +if [ $stage -le 0 ]; then + # eval + dir=$data_fmllr/$mic/eval + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_eval_${LM} \ + $dir data/$mic/eval $gmmdir $dir/log $dir/data + # dev + dir=$data_fmllr/$mic/dev + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_dev_${LM} \ + $dir data/$mic/dev $gmmdir $dir/log $dir/data + # train + dir=$data_fmllr/$mic/train + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir ${gmmdir}_ali \ + $dir data/$mic/train $gmmdir $dir/log $dir/data + # split the data : 90% train 10% cross-validation (held-out) + utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 +fi + +# Prepare the i-vectors, +if [ $stage -le 1 ]; then + local/nnet/prepare_ivectors.sh +fi + +# Pre-train DBN, i.e. a stack of RBMs, +ivector=scp:$data_fmllr/ihm/ivector/ivectors_spk-as-utt_normalized.scp +if [ $stage -le 2 ]; then + dir=exp/$mic/dnn4_pretrain-dbn-ivec + $cuda_cmd $dir/log/pretrain_dbn.log \ + steps/nnet/pretrain_dbn.sh --rbm-iter 1 --ivector $ivector \ + $data_fmllr/$mic/train $dir +fi + +# Train the DNN optimizing per-frame cross-entropy, +if [ $stage -le 3 ]; then + dir=exp/$mic/dnn4_pretrain-dbn-ivec_dnn + ali=${gmmdir}_ali + feature_transform=exp/$mic/dnn4_pretrain-dbn-ivec/final.feature_transform + dbn=exp/$mic/dnn4_pretrain-dbn-ivec/6.dbn + # Train + $cuda_cmd $dir/log/train_nnet.log \ + steps/nnet/train.sh --feature-transform $feature_transform --ivector $ivector \ + --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ + $data_fmllr/$mic/train_tr90 $data_fmllr/$mic/train_cv10 data/lang $ali $ali $dir + # Decode (reuse HCLG graph) + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf --acwt 0.1 \ + --num-threads 3 --ivector $ivector \ + $graph_dir $data_fmllr/$mic/dev $dir/decode_dev_${LM} + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf --acwt 0.1 \ + --num-threads 3 --ivector $ivector \ + $graph_dir $data_fmllr/$mic/eval $dir/decode_eval_${LM} +fi + +# Sequence training using sMBR criterion, we do Stochastic-GD with +# per-utterance updates. We use usually good acwt 0.1. +# Lattices are not regenerated (it is faster). 
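# Illustration only (not part of the recipe): the --ivector option used in the
# pre-training, cross-entropy and decoding stages above appends the
# per-utterance copy of the speaker i-vector to every frame of the features;
# the nnet1 scripts do this with 'append-vector-to-feats' (see the
# steps/nnet/train_mpe.sh hunk later in this patch). A rough sketch of the
# effect, reusing variables from this script (the exact feature pipeline
# inside steps/nnet/train.sh may differ):
#   append-vector-to-feats scp:$data_fmllr/$mic/train/feats.scp \
#     scp:$data_fmllr/ihm/ivector/ivectors_spk-as-utt_normalized.scp ark:- | \
#     feat-to-dim ark:- -   # feature dim grows by 100 (the i-vector dim)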
+ +dir=exp/$mic/dnn4_pretrain-dbn-ivec_dnn_smbr +srcdir=exp/$mic/dnn4_pretrain-dbn-ivec_dnn +acwt=0.1 + +# Generate lattices and alignments, +if [ $stage -le 4 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" --ivector $ivector \ + $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --ivector $ivector \ + --config conf/decode_dnn.conf --acwt $acwt \ + $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_denlats +fi + +# Re-train the DNN by 4 epochs of sMBR, +if [ $stage -le 5 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + --ivector $ivector \ + $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir + # Decode (reuse HCLG graph) + for ITER in 4 1; do + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ + --nnet $dir/${ITER}.nnet --acwt $acwt --ivector $ivector \ + $graph_dir $data_fmllr/$mic/dev $dir/decode_dev_${LM}_it${ITER} + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ + --nnet $dir/${ITER}.nnet --acwt $acwt --ivector $ivector \ + $graph_dir $data_fmllr/$mic/eval $dir/decode_eval_${LM}_it${ITER} + done +fi + +# Getting results [see RESULTS file] +# for x in exp/$mic/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done + diff --git a/egs/ami/s5/local/nnet/run_relu.sh b/egs/ami/s5/local/nnet/run_relu.sh new file mode 100755 index 00000000000..1ee9fe3a0b6 --- /dev/null +++ b/egs/ami/s5/local/nnet/run_relu.sh @@ -0,0 +1,119 @@ +#!/bin/bash -u + +. ./cmd.sh +. ./path.sh + +# DNN training. This script is based on egs/swbd/s5b/local/run_dnn.sh +# Shinji Watanabe, Karel Vesely, + +# Config: +nj=80 +nj_decode=30 +stage=0 # resume training with --stage=N +. utils/parse_options.sh || exit 1; +# + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s [opts] \n\n" `basename $0` + exit 1; +fi +mic=$1 + +gmmdir=exp/$mic/tri4a +data_fmllr=data_${mic}-fmllr-tri4 + +final_lm=`cat data/local/lm/final_lm` +LM=$final_lm.pr1-7 +graph_dir=$gmmdir/graph_${LM} + +set -euxo pipefail + +# Store fMLLR features, so we can train on them easily, +if [ $stage -le 0 -a ! -e $data_fmllr/$mic/eval ]; then + # eval + dir=$data_fmllr/$mic/eval + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_eval_${LM} \ + $dir data/$mic/eval $gmmdir $dir/log $dir/data + # dev + dir=$data_fmllr/$mic/dev + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_dev_${LM} \ + $dir data/$mic/dev $gmmdir $dir/log $dir/data + # train + dir=$data_fmllr/$mic/train + steps/nnet/make_fmllr_feats.sh --nj 15 --cmd "$train_cmd" \ + --transform-dir ${gmmdir}_ali \ + $dir data/$mic/train $gmmdir $dir/log $dir/data + # split the data : 90% train 10% cross-validation (held-out) + utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 +fi + +train=data_ihm-fmllr-tri4/ihm/train +dev=data_ihm-fmllr-tri4/ihm/dev +eval=data_ihm-fmllr-tri4/ihm/eval + +lrate=0.00025 +param_std=0.02 +lr_alpha=1.0 +lr_beta=0.75 +dropout_schedule=0.2,0.2,0.2,0.2,0.2,0.0 +gmm=$gmmdir +graph=$graph_dir + +# Train 6 layer DNN from random initialization, +# - Parametric RELU, alphas+betas trained, +# - Dropout retention 0.8 in 5 initial epochs with fixed learning rate, +if [ $stage -le 1 ]; then + # Train the DNN optimizing per-frame cross-entropy. 
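# Illustration only (not part of the recipe): how the --dropout-schedule option
# above is consumed by steps/nnet/train_scheduler.sh (changed later in this
# patch). For the first N epochs the dropout rate is taken from the
# comma-separated list and applied with 'nnet-copy --dropout-rate=...'; once
# the list runs out, the current rate is kept. The five 0.2 entries correspond
# to the "Dropout retention 0.8" mentioned above, and the final 0.0 disables
# dropout.
#   dropout_array=($(echo $dropout_schedule | tr ',' ' '))
#   for iter in 01 02 03 04 05 06 07; do
#     rate=${dropout_array[$((${iter#0}-1))]-''}
#     [ -n "$rate" ] && echo "epoch $iter: nnet-copy --dropout-rate=$rate ..." \
#                    || echo "epoch $iter: keep current dropout rate"
#   done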
+ dir=exp/$mic/dnn4d-6L1024-relu + ali=${gmm}_ali + # Train + $cuda_cmd $dir/log/train_nnet.log \ + steps/nnet/train.sh --learn-rate $lrate \ + --splice 5 --hid-layers 6 --hid-dim 1024 \ + --proto-opts "--activation-type= --activation-opts=_${lr_alpha}__${lr_beta} --param-stddev-factor $param_std --hid-bias-mean 0 --hid-bias-range 0 --with-dropout --no-glorot-scaled-stddev --no-smaller-input-weights" \ + --scheduler-opts "--keep-lr-iters 5 --dropout-schedule $dropout_schedule" \ + ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir + # Decode (reuse HCLG graph) + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --acwt 0.1 \ + $graph $dev $dir/decode_$(basename $dev) + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --acwt 0.1 \ + $graph $eval $dir/decode_$(basename $eval) +fi + +# Sequence training using sMBR criterion, we do Stochastic-GD with +# per-utterance updates. We use usually good acwt 0.1. +# Lattices are not regenerated (it is faster). + +dir=exp/$mic/dnn4d-6L1024-relu_smbr +srcdir=exp/$mic/dnn4d-6L1024-relu +acwt=0.1 + +# Generate lattices and alignments, +if [ $stage -le 3 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.conf \ + --acwt $acwt $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_denlats +fi + +# Re-train the DNN by 4 epochs of sMBR, +if [ $stage -le 4 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + --learn-rate 0.0000003 \ + $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir + # Decode (reuse HCLG graph) + for ITER in 4 1; do + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir $data_fmllr/$mic/dev $dir/decode_dev_${LM}_it${ITER} + steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir $data_fmllr/$mic/eval $dir/decode_eval_${LM}_it${ITER} + done +fi + +# Getting results [see RESULTS file] +# for x in exp/$mic/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done + diff --git a/egs/wsj/s5/steps/nnet/ivector/extract_ivectors.sh b/egs/wsj/s5/steps/nnet/ivector/extract_ivectors.sh new file mode 100755 index 00000000000..36af3ab49d8 --- /dev/null +++ b/egs/wsj/s5/steps/nnet/ivector/extract_ivectors.sh @@ -0,0 +1,219 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# 2016 Brno University of Technology (author: Karel Vesely) +# Apache 2.0. + + +# This script computes iVectors in the same format as extract_ivectors_online.sh, +# except that they are actually not really computed online, they are first computed +# per speaker and just duplicated many times. +# This is mainly intended for use in decoding, where you want the best possible +# quality of iVectors. +# +# This setup also makes it possible to use a previous decoding or alignment, to +# down-weight silence in the stats (default is --silence-weight 0.0). +# +# This is for when you use the "online-decoding" setup in an offline task, and +# you want the best possible results. + + +# Begin configuration section. 
+nj=30 +cmd="run.pl" +stage=0 +num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) + +posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for + # inter-frame correlations. Making this small during iVector + # extraction is equivalent to scaling up the prior, and will + # will tend to produce smaller iVectors where data-counts are + # small. It's not so important that this match the value + # used when training the iVector extractor, but more important + # that this match the value used when you do real online decoding + # with the neural nets trained with these iVectors. + +max_count=100 # Interpret this as a number of frames times posterior scale... + # this config ensures that once the count exceeds this (i.e. + # 1000 frames, or 10 seconds, by default), we start to scale + # down the stats, accentuating the prior term. This seems quite + # important for some reason. + +silence_weight=0.0 +acwt=0.1 # used if input is a decode dir, to get best path from lattices. +mdl=final # change this if decode directory did not have ../final.mdl present. + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ] && [ $# != 5 ]; then + echo "Usage: $0 [options] [||] " + echo " e.g.: $0 data/test exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " # Ignored if or supplied." + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + echo " --min-post # Pruning threshold for posteriors" + echo " --ivector-period # How often to extract an iVector (frames)" + echo " --posterior-scale # Scale on posteriors in iVector extraction; " + echo " # affects strength of prior term." + + exit 1; +fi + +set -euxo pipefail + +if [ $# -eq 4 ]; then + data=$1 + lang=$2 + srcdir=$3 + dir=$4 +else # 5 arguments + data=$1 + lang=$2 + srcdir=$3 + ali_or_decode_dir=$4 + dir=$5 +fi + +for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $lang/phones.txt; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +mkdir -p $dir/log +silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + +if [ ! -z "$ali_or_decode_dir" ]; then + + if [ -f $ali_or_decode_dir/ali.1.gz ]; then + if [ ! -f $ali_or_decode_dir/${mdl}.mdl ]; then + echo "$0: expected $ali_or_decode_dir/${mdl}.mdl to exist." + exit 1; + fi + nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1; + + if [ $stage -le 0 ]; then + rm $dir/weights.*.gz 2>/dev/null || true + + $cmd JOB=1:$nj_orig $dir/log/ali_to_post.JOB.log \ + gunzip -c $ali_or_decode_dir/ali.JOB.gz \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/final.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1; + + # put all the weights in one archive. 
+ for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1; + rm $dir/weights.*.gz || exit 1; + fi + + elif [ -f $ali_or_decode_dir/lat.1.gz ]; then + nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1; + if [ ! -f $ali_or_decode_dir/../${mdl}.mdl ]; then + echo "$0: expected $ali_or_decode_dir/../${mdl}.mdl to exist." + exit 1; + fi + + + if [ $stage -le 0 ]; then + rm $dir/weights.*.gz 2>/dev/null || true + + $cmd JOB=1:$nj_orig $dir/log/lat_to_post.JOB.log \ + lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \ + ali-to-post ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/../${mdl}.mdl ark:- ark:- \| \ + post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1; + + # put all the weights in one archive. + for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1; + rm $dir/weights.*.gz || exit 1; + fi + + elif [ -f $ali_or_decode_dir ] && gunzip -c $ali_or_decode_dir >/dev/null; then + cp $ali_or_decode_dir $dir/weights.gz || exit 1; + + else + echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir"; + exit 1; + fi +fi + +sdata=$data/split$nj; +utils/split_data.sh $data $nj || exit 1; + +gmm_feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" +feats="$gmm_feats" + +# (here originally was the sub-speaker hack), +this_sdata=$sdata + +# Per-speaker i-vectors, +if [ $stage -le 2 ]; then + if [ ! -z "$ali_or_decode_dir" ]; then + $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \ + ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \ + --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark + else + $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \ + --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark + fi +fi + +# Per-utterance i-vectors, +if [ $stage -le 3 ]; then + if [ ! -z "$ali_or_decode_dir" ]; then + $cmd JOB=1:$nj $dir/log/extract_ivectors_utt.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \ + ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true --max-count=$max_count \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_utt.JOB.ark + else + $cmd JOB=1:$nj $dir/log/extract_ivectors_utt.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \ + ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true --max-count=$max_count \ + $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_utt.JOB.ark + fi +fi + + +# get an utterance-level set of iVectors (just duplicate the speaker-level ones). +# note: if $this_sdata is set $dir/split$nj, then these won't be real speakers, they'll +# be "sub-speakers" (speakers split up into multiple utterances). 
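# Toy illustration (hypothetical data, not part of the script) of the
# duplication step below: 'utils/apply_map.pl -f 2' replaces the speaker-id in
# field 2 of utt2spk with that speaker's i-vector line, so every utterance of a
# speaker gets an identical copy of the per-speaker i-vector.
#   echo -e "spkA_utt1 spkA\nspkA_utt2 spkA\nspkB_utt1 spkB" >utt2spk
#   echo -e "spkA  [ 0.1 0.2 ]\nspkB  [ 0.3 0.4 ]" >ivectors_spk.1.ark
#   utils/apply_map.pl -f 2 ivectors_spk.1.ark <utt2spk
#   # -> spkA_utt1 [ 0.1 0.2 ]
#   #    spkA_utt2 [ 0.1 0.2 ]
#   #    spkB_utt1 [ 0.3 0.4 ]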
+if [ $stage -le 4 ]; then + for j in $(seq $nj); do + utils/apply_map.pl -f 2 $dir/ivectors_spk.${j}.ark <$this_sdata/$j/utt2spk >$dir/ivectors_spk-as-utt.${j}.ark + done +fi + +ivector_dim=$[$(head -n 1 $dir/ivectors_spk.1.ark | wc -w) - 3] +echo "$0: iVector dim is $ivector_dim" + +absdir=$(readlink -f $dir) + +if [ $stage -le 5 ]; then + echo "$0: merging iVectors across jobs" + copy-vector "ark:cat $dir/ivectors_spk.*.ark |" ark,scp:$absdir/ivectors_spk.ark,$dir/ivectors_spk.scp + rm $dir/ivectors_spk.*.ark + copy-vector "ark:cat $dir/ivectors_spk-as-utt.*.ark |" ark,scp:$absdir/ivectors_spk-as-utt.ark,$dir/ivectors_spk-as-utt.scp + rm $dir/ivectors_spk-as-utt.*.ark + copy-vector "ark:cat $dir/ivectors_utt.*.ark |" ark,scp:$absdir/ivectors_utt.ark,$dir/ivectors_utt.scp + rm $dir/ivectors_utt.*.ark +fi + +echo "$0: done extracting iVectors (per-speaker, per-sentence) into '$dir'" + diff --git a/egs/wsj/s5/steps/nnet/ivector/train_diag_ubm.sh b/egs/wsj/s5/steps/nnet/ivector/train_diag_ubm.sh new file mode 100755 index 00000000000..ebd36a9e8e4 --- /dev/null +++ b/egs/wsj/s5/steps/nnet/ivector/train_diag_ubm.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2013 Daniel Povey +# 2016 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0. + +# This script trains a diagonal UBM that we'll use in online iVector estimation, +# where the online-estimated iVector will be used as a secondary input to a deep +# neural net for single-pass DNN-based decoding. + +# This script was modified from ../../sre08/v1/sid/train_diag_ubm.sh. +# It trains a diagonal UBM on top of input features. We use the original features, +# assuming they are already normalized (or transformed). + +# This script does not use the trained model from the source directory to +# initialize the diagonal GMM; instead, we initialize the GMM using +# gmm-global-init-from-feats, which sets the means to random data points and +# then does some iterations of E-M in memory. After the in-memory +# initialization we train for a few iterations in parallel. +# Note that there is a slight mismatch in that the source LDA+MLLT matrix +# (final.mat) will have been estimated using standard CMVN, and we're using +# online CMVN. We don't think this will have much effect. + + +# Begin configuration section. +nj=4 +cmd=run.pl +num_iters=4 +stage=-2 +num_gselect=30 # Number of Gaussian-selection indices to use while training + # the model. +num_frames=500000 # number of frames to keep in memory for initialization +num_iters_init=20 +initial_gauss_proportion=0.5 # Start with half the target number of Gaussians +subsample=2 # subsample all features with this periodicity, in the main E-M phase. +cleanup=true +min_gaussian_weight=0.0001 +remove_low_count_gaussians=true # set this to false if you need #gauss to stay fixed. +num_threads=8 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 data/train 1024 exp/diag_ubm" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # number of parallel jobs to run." + echo " --num-iters # number of iterations of parallel " + echo " # training (default: $num_iters)" + echo " --stage # stage to do partial re-run from." 
+ echo " --num-gselect # Number of Gaussians per frame to" + echo " # limit computation to, for speed" + echo " --subsample # In main E-M phase, use every n" + echo " # frames (a speedup)" + echo " --num-frames # Maximum num-frames to keep in memory" + echo " # for model initialization" + echo " --num-iters-init # Number of E-M iterations for model" + echo " # initialization" + echo " --initial-gauss-proportion # Proportion of Gaussians to start with" + echo " # in initialization phase (then split)" + echo " --num-threads # number of threads to use in initialization" + echo " # phase (must match with parallel-opts option)" + echo " --min-gaussian-weight # min Gaussian weight allowed in GMM" + echo " # initialization (this relatively high" + echo " # value keeps counts fairly even)" + exit 1; +fi + +set -euo pipefail + +data=$1 +num_gauss=$2 +dir=$3 + +! [ $num_gauss -gt 0 ] && echo "Bad num-gauss $num_gauss" && exit 1; + +sdata=$data/split$nj +mkdir -p $dir/log +utils/split_data.sh $data $nj || exit 1; + +for f in $data/feats.scp; do + [ ! -f "$f" ] && echo "$0: expecting file $f to exist" && exit 1 +done + +# Note: there is no point subsampling all_feats, because gmm-global-init-from-feats +# effectively does subsampling itself (it keeps a random subset of the features). +all_feats="ark,s,cs:copy-feats scp:$data/feats.scp ark:- |" +feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |" + +num_gauss_init=$(perl -e "print int($initial_gauss_proportion * $num_gauss); "); +! [ $num_gauss_init -gt 0 ] && echo "Invalid num-gauss-init $num_gauss_init" && exit 1; + +if [ $stage -le -2 ]; then + echo "$0: initializing model from E-M in memory, " + echo "$0: starting from $num_gauss_init Gaussians, reaching $num_gauss;" + echo "$0: for $num_iters_init iterations, using at most $num_frames frames of data" + + $cmd --num-threads $num_threads $dir/log/gmm_init.log \ + gmm-global-init-from-feats --num-threads=$num_threads --num-frames=$num_frames \ + --min-gaussian-weight=$min_gaussian_weight \ + --num-gauss=$num_gauss --num-gauss-init=$num_gauss_init --num-iters=$num_iters_init \ + "$all_feats" $dir/0.dubm +fi + +# Store Gaussian selection indices on disk-- this speeds up the training passes. +if [ $stage -le -1 ]; then + echo "Getting Gaussian-selection info" + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + gmm-gselect --n=$num_gselect $dir/0.dubm "$feats" \ + "ark:|gzip -c >$dir/gselect.JOB.gz" +fi + +echo "$0: will train for $num_iters iterations, in parallel over" +echo "$0: $nj machines, parallelized with '$cmd'" + +for x in $(seq 0 $[$num_iters-1]); do + echo "$0: Training pass $x" + if [ $stage -le $x ]; then + # Accumulate stats. + $cmd JOB=1:$nj $dir/log/acc.${x}.JOB.log \ + gmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \ + $dir/$x.dubm "$feats" $dir/$x.JOB.acc + if [ $x -lt $[$num_iters-1] ]; then # Don't remove low-count Gaussians till last iter, + opt="--remove-low-count-gaussians=false" # or gselect info won't be valid any more. + else + opt="--remove-low-count-gaussians=$remove_low_count_gaussians" + fi + $cmd $dir/log/update.${x}.log \ + gmm-global-est $opt --min-gaussian-weight=$min_gaussian_weight $dir/${x}.dubm "gmm-global-sum-accs - $dir/${x}.*.acc|" \ + $dir/$[$x+1].dubm + rm $dir/$x.*.acc $dir/$x.dubm + fi +done + +rm $dir/gselect.*.gz +mv $dir/$num_iters.dubm $dir/final.dubm + +exit 0 # Done! 
+ diff --git a/egs/wsj/s5/steps/nnet/ivector/train_ivector_extractor.sh b/egs/wsj/s5/steps/nnet/ivector/train_ivector_extractor.sh new file mode 100755 index 00000000000..252035a525f --- /dev/null +++ b/egs/wsj/s5/steps/nnet/ivector/train_ivector_extractor.sh @@ -0,0 +1,179 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# 2016 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0. + +# This script is modified from ^/egs/sre08/v1/sid/train_ivector_extractor.sh. +# It trains an iVector extractor for use in DNN training. + +# This script trains the i-vector extractor. Note: there are 3 separate levels +# of parallelization: num_threads, num_processes, and num_jobs. This may seem a +# bit excessive. It has to do with minimizing memory usage and disk I/O, +# subject to various constraints. The "num_threads" is how many threads a +# program uses; the "num_processes" is the number of separate processes a single +# job spawns, and then sums the accumulators in memory. Our recommendation: +# - Set num_threads to the minimum of (4, or how many virtual cores your machine has). +# (because of needing to lock various global quantities, the program can't +# use many more than 4 threads with good CPU utilization). +# - Set num_processes to the number of virtual cores on each machine you have, divided by +# num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue +# that's busy with other people's jobs, it may be wise to set it to rather less +# than this maximum though, or your jobs won't get scheduled. And if memory is +# tight you need to be careful; in our normal setup, each process uses about 5G. +# - Set num_jobs to as many of the jobs (each using $num_threads * $num_processes CPUs) +# your queue will let you run at one time, but don't go much more than 10 or 20, or +# summing the accumulators will possibly get slow. If you have a lot of data, you +# may want more jobs, though. + +# Begin configuration section. +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we + # run is nj * num_processes * num_threads, and the number of + # separate pieces of data is nj * num_processes. +num_threads=4 +num_processes=2 # each job runs this many processes, each with --num-threads threads +cmd="run.pl" +stage=-4 +ivector_dim=100 # dimension of the extracted i-vector +num_iters=10 +num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select +posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for + # inter-frame correlations. +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) + # caution: you should use the same value in the online-estimation + # code. +subsample=2 # This speeds up the training: training on every 2nd feature + # (configurable) Since the features are highly correlated across + # frames, we don't expect to lose too much from this. +parallel_opts= # ignored now. +cleanup=true +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 data/train exp/nnet2_online/diag_ubm/ exp/nnet2_online/extractor" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ echo " --num-iters <#iters|10> # Number of iterations of E-M" + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --num-processes # Number of processes for each queue job (relates" + echo " # to summing accs in memory)" + echo " --num-threads # Number of threads for each process (can't be usefully" + echo " # increased much above 4)" + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + exit 1; +fi + +set -euxo pipefail + +data=$1 +srcdir=$2 +dir=$3 + +for f in $srcdir/final.dubm $data/feats.scp; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +nj_full=$[$nj*$num_processes] +sdata=$data/split$nj_full; +utils/split_data.sh $data $nj_full + +cp $srcdir/final.dubm $dir + +## Set up features. +gmm_feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |" +feats="$gmm_feats" + +# Initialize the i-vector extractor using the input GMM, which is converted to +# full because that's what the i-vector extractor expects. Note: we have to do +# --use-weights=false to disable regression of the log weights on the ivector, +# because that would make the online estimation of the ivector difficult (since +# the online/real-time ivector estimation is the whole point of this script). +if [ $stage -le -2 ]; then + $cmd $dir/log/init.log \ + ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=false \ + "gmm-global-to-fgmm $dir/final.dubm -|" $dir/0.ie +fi + +# Do Gaussian selection and posterior extracion + +# if we subsample frame, modify the posterior-scale; this is likely +# to make the original posterior-scale (before subsampling) suitable. +modified_posterior_scale=$(perl -e "print $posterior_scale * $subsample;"); + +if [ $stage -le -1 ]; then + echo $nj_full > $dir/num_jobs + echo "$0: doing Gaussian selection and posterior computation" + $cmd JOB=1:$nj_full $dir/log/post.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $dir/final.dubm "$gmm_feats" ark:- \| \ + scale-post ark:- $modified_posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" +else + # make sure we at least have the right number of post.*.gz files. + if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then + echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)" + exit 1 + fi +fi + +x=0 +while [ $x -lt $num_iters ]; do + if [ $stage -le $x ]; then + rm $dir/.error 2>/dev/null || true + + Args=() # bash array of training commands for 1:nj, that put accs to stdout. 
+ for j in $(seq $nj_full); do + Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g` + done + + echo "Accumulating stats (pass $x)" + for g in $(seq $nj); do + start=$[$num_processes*($g-1)+1] + $cmd --num-threads $[$num_threads*$num_processes] $dir/log/acc.$x.$g.log \ + ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \ + $dir/acc.$x.$g || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; + + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x + + echo "Updating model (pass $x)" + nt=$[$num_threads*$num_processes] # use the same number of threads that + # each accumulation process uses, since we + # can be sure the queue will support this many. + # + # The parallel-opts was either specified by + # the user or we computed it correctly in + # tge previous stages + $cmd --num-threads $[$num_threads*$num_processes] $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie + rm $dir/acc.$x.* + + if $cleanup; then + rm $dir/acc.$x + # rm $dir/$x.ie + fi + fi + x=$[$x+1] +done + +rm $dir/final.ie 2>/dev/null || true +ln -s $x.ie $dir/final.ie diff --git a/egs/wsj/s5/steps/nnet/train_mpe.sh b/egs/wsj/s5/steps/nnet/train_mpe.sh index d6203a7da60..1d2a6256ea8 100755 --- a/egs/wsj/s5/steps/nnet/train_mpe.sh +++ b/egs/wsj/s5/steps/nnet/train_mpe.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely) +# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0. # Sequence-discriminative MPE/sMBR training of DNN. @@ -136,7 +136,7 @@ feats="ark,o:copy-feats scp:$dir/train.scp ark:- |" # add-ivector (optional), if [ -e $D/ivector_dim ]; then [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 - # Get the tool, + # Get the tool, ivector_append_tool=append-vector-to-feats # default, [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) # Check dims, @@ -160,7 +160,7 @@ fi ### ### Prepare the alignments -### +### # Assuming all alignments will fit into memory ali="ark:gunzip -c $alidir/ali.*.gz |" @@ -202,7 +202,7 @@ while [ $x -le $num_iters ]; do x=$((x+1)) learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") - + done (cd $dir; [ -e final.nnet ] && unlink final.nnet; ln -s $((x-1)).nnet final.nnet) diff --git a/egs/wsj/s5/steps/nnet/train_scheduler.sh b/egs/wsj/s5/steps/nnet/train_scheduler.sh index dbd9f7123eb..e2499b17274 100755 --- a/egs/wsj/s5/steps/nnet/train_scheduler.sh +++ b/egs/wsj/s5/steps/nnet/train_scheduler.sh @@ -22,7 +22,7 @@ feature_transform= max_iters=20 min_iters=0 # keep training, disable weight rejection, start learn-rate halving as usual, keep_lr_iters=0 # fix learning rate for N initial epochs, disable weight rejection, -dropout_iters= # Disable dropout after 'N' initial epochs, +dropout_schedule= # dropout-rates for N initial epochs, for example: 0.1,0.1,0.1,0.1,0.1,0.0 start_halving_impr=0.01 end_halving_impr=0.001 halving_factor=0.5 @@ -31,11 +31,11 @@ halving_factor=0.5 verbose=1 frame_weights= utt_weights= - + # End configuration. echo "$0 $@" # Print the command line for logging -[ -f path.sh ] && . ./path.sh; +[ -f path.sh ] && . ./path.sh; . 
parse_options.sh || exit 1; @@ -60,6 +60,8 @@ dir=$6 [ ! -d $dir/log ] && mkdir $dir/log [ ! -d $dir/nnet ] && mkdir $dir/nnet +dropout_array=($(echo ${dropout_schedule} | tr ',' ' ')) + # Skip training [ -e $dir/final.nnet ] && echo "'$dir/final.nnet' exists, skipping training" && exit 0 @@ -99,10 +101,11 @@ for iter in $(seq -w $max_iters); do # skip iteration (epoch) if already done, [ -e $dir/.done_iter$iter ] && echo -n "skipping... " && ls $mlp_next* && continue - # disable dropout? - if [ -n $dropout_iters -a $((dropout_iters+1)) -eq $iter ]; then - nnet-copy --dropout-retention=1.0 $mlp_best ${mlp_best}.no_dropout - mlp_best=${mlp_best}.no_dropout + # set dropout-rate from the schedule, + if [ -n ${dropout_array[$((${iter#0}-1))]-''} ]; then + dropout_rate=${dropout_array[$((${iter#0}-1))]} + nnet-copy --dropout-rate=$dropout_rate $mlp_best ${mlp_best}.dropout_rate${dropout_rate} + mlp_best=${mlp_best}.dropout_rate${dropout_rate} fi # training, @@ -114,11 +117,11 @@ for iter in $(seq -w $max_iters); do ${frame_weights:+ "--frame-weights=$frame_weights"} \ ${utt_weights:+ "--utt-weights=$utt_weights"} \ "$feats_tr" "$labels_tr" $mlp_best $mlp_next \ - 2>> $log || exit 1; + 2>> $log || exit 1; tr_loss=$(cat $dir/log/iter${iter}.tr.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') echo -n "TRAIN AVG.LOSS $(printf "%.4f" $tr_loss), (lrate$(printf "%.6g" $learn_rate)), " - + # cross-validation, log=$dir/log/iter${iter}.cv.log; hostname>$log $train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \ @@ -127,7 +130,7 @@ for iter in $(seq -w $max_iters); do ${utt_weights:+ "--utt-weights=$utt_weights"} \ "$feats_cv" "$labels_cv" $mlp_next \ 2>>$log || exit 1; - + loss_new=$(cat $dir/log/iter${iter}.cv.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') echo -n "CROSSVAL AVG.LOSS $(printf "%.4f" $loss_new), " @@ -141,7 +144,7 @@ for iter in $(seq -w $max_iters); do [ $iter -le $keep_lr_iters ] && mlp_best=${mlp_best}_keep-lr-iters-$keep_lr_iters mv $mlp_next $mlp_best echo "nnet accepted ($(basename $mlp_best))" - echo $mlp_best > $dir/.mlp_best + echo $mlp_best > $dir/.mlp_best else # rejecting, mlp_reject=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)_rejected @@ -151,9 +154,9 @@ for iter in $(seq -w $max_iters); do # create .done file, the iteration (epoch) is completed, touch $dir/.done_iter$iter - + # continue with original learn-rate, - [ $iter -le $keep_lr_iters ] && continue + [ $iter -le $keep_lr_iters ] && continue # stopping criterion, rel_impr=$(bc <<< "scale=10; ($loss_prev-$loss)/$loss_prev") @@ -171,7 +174,7 @@ for iter in $(seq -w $max_iters); do halving=1 echo $halving >$dir/.halving fi - + # reduce the learning-rate, if [ 1 == $halving ]; then learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") @@ -180,7 +183,7 @@ for iter in $(seq -w $max_iters); do done # select the best network, -if [ $mlp_best != $mlp_init ]; then +if [ $mlp_best != $mlp_init ]; then mlp_final=${mlp_best}_final_ ( cd $dir/nnet; ln -s $(basename $mlp_best) $(basename $mlp_final); ) ( cd $dir; ln -s nnet/$(basename $mlp_final) final.nnet; ) diff --git a/egs/wsj/s5/utils/nnet/make_nnet_proto.py b/egs/wsj/s5/utils/nnet/make_nnet_proto.py index 873c5107822..7b5c50beeb8 100755 --- a/egs/wsj/s5/utils/nnet/make_nnet_proto.py +++ b/egs/wsj/s5/utils/nnet/make_nnet_proto.py @@ -72,10 +72,12 @@ parser.add_option('--bottleneck-dim', dest='bottleneck_dim', help='Make bottleneck network with desired 
bn-dim (0 = no bottleneck) [default: %default]', default=0, type='int'); -parser.add_option('--dropout-retention', dest='dropout_retention', - help='Put dropout after the non-linearity of hidden layerm (0.0 = disabled) [default: %default]', - default=0.0, type='float'); - +parser.add_option('--with-dropout', dest='with_dropout', + help='Add after the non-linearity of hidden layer.', + action='store_true', default=False); +parser.add_option('--dropout-opts', dest='dropout_opts', + help='Extra options for dropout [default: %default]', + default='', type='string'); (o,args) = parser.parse_args() @@ -86,6 +88,7 @@ # A HACK TO PASS MULTI-WORD OPTIONS, WORDS ARE CONNECTED BY UNDERSCORES '_', o.activation_opts = o.activation_opts.replace("_"," ") o.affine_opts = o.affine_opts.replace("_"," ") +o.dropout_opts = o.dropout_opts.replace("_"," ") (feat_dim, num_leaves, num_hid_layers, num_hid_neurons) = map(int,args); ### End parse options @@ -179,8 +182,8 @@ def Glorot(dim1, dim2): # This is done by multiplying with stddev(U[0,1]) = sqrt(1/12). # The stddev of weights is consequently reduced with scale 0.29, print "%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts) -if o.dropout_retention > 0.0: - print " %d %d %f" % (num_hid_neurons, num_hid_neurons, o.dropout_retention) +if o.with_dropout: + print " %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts) # Internal AffineTransforms, @@ -189,8 +192,8 @@ def Glorot(dim1, dim2): (num_hid_neurons, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \ (o.param_stddev_factor * Glorot(num_hid_neurons, num_hid_neurons)), o.max_norm, o.affine_opts) print "%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts) - if o.dropout_retention > 0.0: - print " %d %d %f" % (num_hid_neurons, num_hid_neurons, o.dropout_retention) + if o.with_dropout: + print " %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts) # Optionaly add bottleneck, if o.bottleneck_dim != 0: @@ -213,8 +216,8 @@ def Glorot(dim1, dim2): (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \ (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)), o.max_norm, o.affine_opts) print "%s %d %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts) - if o.dropout_retention > 0.0: - print " %d %d %f" % (num_hid_neurons, num_hid_neurons, o.dropout_retention) + if o.with_dropout: + print " %d %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts) # Last AffineTransform (10x smaller learning rate on bias) print " %d %d %f %f %f %f %f" % \ diff --git a/src/ivectorbin/ivector-normalize-length.cc b/src/ivectorbin/ivector-normalize-length.cc index d27a92bd01b..b1a7d665e2a 100644 --- a/src/ivectorbin/ivector-normalize-length.cc +++ b/src/ivectorbin/ivector-normalize-length.cc @@ -36,15 +36,19 @@ int main(int argc, char *argv[]) { "\n" "e.g.: \n" " ivector-normalize-length ark:ivectors.ark ark:normalized_ivectors.ark\n"; - + ParseOptions po(usage); bool normalize = true; po.Register("normalize", &normalize, "Set this to false to disable normalization"); - + + bool scaleup = true; + po.Register("scaleup", &scaleup, + "If 'true', the normalized iVector is scaled-up by 'sqrt(dim)'"); + po.Read(argc, argv); - + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); @@ -55,13 +59,13 @@ int main(int argc, char *argv[]) { int32 num_done = 0; - + double tot_ratio = 0.0, tot_ratio2 = 0.0; SequentialBaseFloatVectorReader ivector_reader(ivector_rspecifier); BaseFloatVectorWriter 
ivector_writer(ivector_wspecifier); - + for (; !ivector_reader.Done(); ivector_reader.Next()) { std::string key = ivector_reader.Key(); Vector ivector = ivector_reader.Value(); @@ -69,7 +73,8 @@ int main(int argc, char *argv[]) { BaseFloat ratio = norm / sqrt(ivector.Dim()); // how much larger it is // than it would be, in // expectation, if normally - // distributed. + if (!scaleup) ratio = norm; + KALDI_VLOG(2) << "Ratio for key " << key << " is " << ratio; if (ratio == 0.0) { KALDI_WARN << "Zero iVector"; @@ -88,7 +93,7 @@ int main(int argc, char *argv[]) { ratio_stddev = sqrt(tot_ratio2 / num_done - avg_ratio * avg_ratio); KALDI_LOG << "Average ratio of iVector to expected length was " << avg_ratio << ", standard deviation was " << ratio_stddev; - } + } return (num_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/nnet/nnet-activation.h b/src/nnet/nnet-activation.h index bf8a0ee5afc..74b0ebad650 100644 --- a/src/nnet/nnet-activation.h +++ b/src/nnet/nnet-activation.h @@ -1,6 +1,6 @@ // nnet/nnet-activation.h -// Copyright 2011-2013 Brno University of Technology (author: Karel Vesely) +// Copyright 2011-2016 Brno University of Technology (author: Karel Vesely) // See ../../COPYING for clarification regarding multiple authors // @@ -23,6 +23,7 @@ #include #include +#include #include "nnet/nnet-component.h" #include "nnet/nnet-utils.h" @@ -269,7 +270,7 @@ class Dropout : public Component { public: Dropout(int32 dim_in, int32 dim_out): Component(dim_in, dim_out), - dropout_retention_(0.5) + dropout_rate_(0.5) { } ~Dropout() @@ -284,36 +285,58 @@ class Dropout : public Component { std::string token; while (is >> std::ws, !is.eof()) { ReadToken(is, false, &token); - /**/ if (token == "") ReadBasicType(is, false, &dropout_retention_); + /**/ if (token == "") ReadBasicType(is, false, &dropout_rate_); else KALDI_ERR << "Unknown token " << token << ", a typo in config?" - << " (DropoutRetention)"; + << " (DropoutRate)"; } - KALDI_ASSERT(dropout_retention_ > 0.0 && dropout_retention_ <= 1.0); + KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0); } void ReadData(std::istream &is, bool binary) { - if ('<' == Peek(is, binary)) { - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &dropout_retention_); + // Read all the '' in arbitrary order, + bool finished = false; + while ('<' == Peek(is, binary) && !finished) { + std::string token; + int first_char = PeekToken(is, binary); + switch (first_char) { + case 'D': ReadToken(is, false, &token); + /**/ if (token == "") ReadBasicType(is, binary, &dropout_rate_); + else if (token == "") { /* compatibility */ + BaseFloat dropout_retention; + ReadBasicType(is, binary, &dropout_retention); + dropout_rate_ = 1.0 - dropout_retention; + } else KALDI_ERR << "Unknown token: " << token; + break; + case '!': ExpectToken(is, binary, ""); + finished = true; + break; + default: ReadToken(is, false, &token); + KALDI_ERR << "Unknown token: " << token; + } } - KALDI_ASSERT(dropout_retention_ > 0.0 && dropout_retention_ <= 1.0); + KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0); } void WriteData(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, dropout_retention_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dropout_rate_); + } + + std::string Info() const { + return std::string(" ") + ToString(dropout_rate_); } void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { out->CopyFromMat(in); - // switch off 50% of the inputs... 
+ // set N inputs to zero, according to the 'dropout_rate_' ... dropout_mask_.Resize(out->NumRows(), out->NumCols()); - dropout_mask_.Set(dropout_retention_); - rand_.BinarizeProbs(dropout_mask_, &dropout_mask_); + rand_.RandUniform(&dropout_mask_); // [0..1] + dropout_mask_.Add(-dropout_rate_); // [(-rate)..(1-rate)] + dropout_mask_.Heaviside(dropout_mask_); // (x > 0.0 ? 1 : 0) out->MulElements(dropout_mask_); - // rescale to keep same dynamic range as w/o dropout - out->Scale(1.0/dropout_retention_); + // rescale to keep the same dynamic range as w/o dropout, + out->Scale(1.0 / (1.0 - dropout_rate_)); } void BackpropagateFnc(const CuMatrixBase &in, @@ -323,21 +346,24 @@ class Dropout : public Component { in_diff->CopyFromMat(out_diff); // use same mask on the error derivatives... in_diff->MulElements(dropout_mask_); - // enlarge output to fit dynamic range w/o dropout - in_diff->Scale(1.0/dropout_retention_); + // enlarge the output to fit same dynamic range as w/o dropout + in_diff->Scale(1.0 / (1.0 - dropout_rate_)); } - BaseFloat GetDropoutRetention() { return dropout_retention_; } + BaseFloat GetDropoutRate() { return dropout_rate_; } - void SetDropoutRetention(BaseFloat dr) { - dropout_retention_ = dr; - KALDI_ASSERT(dropout_retention_ > 0.0 && dropout_retention_ <= 1.0); + void SetDropoutRate(BaseFloat dr) { + dropout_rate_ = dr; + KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0); } private: - CuRand rand_; - CuMatrix dropout_mask_; - BaseFloat dropout_retention_; + BaseFloat dropout_rate_; ///< probability that a neuron is dropped, + + CuRand rand_; ///< generator of random numbers, + + CuMatrix dropout_mask_; // random binary mask, + // 1 = keep neuron, 0 = drop neuron, }; } // namespace nnet1 diff --git a/src/nnet/nnet-nnet.cc b/src/nnet/nnet-nnet.cc index 4b0b565b94c..86c5f9e5ad0 100644 --- a/src/nnet/nnet-nnet.cc +++ b/src/nnet/nnet-nnet.cc @@ -265,13 +265,13 @@ void Nnet::SetParams(const VectorBase& params) { KALDI_ASSERT(pos == NumParams()); } -void Nnet::SetDropoutRetention(BaseFloat r) { +void Nnet::SetDropoutRate(BaseFloat r) { for (int32 c = 0; c < NumComponents(); c++) { if (GetComponent(c).GetType() == Component::kDropout) { Dropout& comp = dynamic_cast(GetComponent(c)); - BaseFloat r_old = comp.GetDropoutRetention(); - comp.SetDropoutRetention(r); - KALDI_LOG << "Setting dropout-retention in component " << c + BaseFloat r_old = comp.GetDropoutRate(); + comp.SetDropoutRate(r); + KALDI_LOG << "Setting dropout-rate in component " << c << " from " << r_old << " to " << r; } } diff --git a/src/nnet/nnet-nnet.h b/src/nnet/nnet-nnet.h index a06cf072f5f..cf29f91a89d 100644 --- a/src/nnet/nnet-nnet.h +++ b/src/nnet/nnet-nnet.h @@ -123,7 +123,7 @@ class Nnet { void SetParams(const VectorBase& params); /// Set the dropout rate - void SetDropoutRetention(BaseFloat r); + void SetDropoutRate(BaseFloat r); /// Reset streams in multi-stream training, void ResetStreams(const std::vector &stream_reset_flag); diff --git a/src/nnetbin/nnet-copy.cc b/src/nnetbin/nnet-copy.cc index 2567001beb3..c4a27f2dd69 100644 --- a/src/nnetbin/nnet-copy.cc +++ b/src/nnetbin/nnet-copy.cc @@ -37,7 +37,7 @@ int main(int argc, char *argv[]) { bool binary_write = true; int32 remove_first_components = 0; int32 remove_last_components = 0; - BaseFloat dropout_retention = 0.0; + BaseFloat dropout_rate = -1.0; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); @@ -52,8 +52,9 @@ int main(int argc, char *argv[]) { po.Register("remove-last-components", 
&remove_last_components, "Remove N last layers Components from the Nnet"); - po.Register("dropout-retention", &dropout_retention, - "Set dropout retention to a particular value."); + po.Register("dropout-rate", &dropout_rate, + "Probability that neuron is dropped" + "(-1.0 keeps original value)."); std::string from_parallel_component; po.Register("from-parallel-component", &from_parallel_component, @@ -131,8 +132,8 @@ int main(int argc, char *argv[]) { } // dropout, - if (dropout_retention != 0.0) { - nnet.SetDropoutRetention(dropout_retention); + if (dropout_rate != -1.0) { + nnet.SetDropoutRate(dropout_rate); } // store the network, diff --git a/src/nnetbin/nnet-forward.cc b/src/nnetbin/nnet-forward.cc index 1a40d03cdf7..062bca7da9d 100644 --- a/src/nnetbin/nnet-forward.cc +++ b/src/nnetbin/nnet-forward.cc @@ -109,8 +109,8 @@ int main(int argc, char *argv[]) { PdfPrior pdf_prior(prior_opts); // disable dropout, - nnet_transf.SetDropoutRetention(1.0); - nnet.SetDropoutRetention(1.0); + nnet_transf.SetDropoutRate(0.0); + nnet.SetDropoutRate(0.0); kaldi::int64 tot_t = 0; diff --git a/src/nnetbin/nnet-train-frmshuff.cc b/src/nnetbin/nnet-train-frmshuff.cc index 58e50074492..07cfb626d9f 100644 --- a/src/nnetbin/nnet-train-frmshuff.cc +++ b/src/nnetbin/nnet-train-frmshuff.cc @@ -82,11 +82,6 @@ int main(int argc, char *argv[]) { po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA"); - double dropout_retention = 0.0; - po.Register("dropout-retention", &dropout_retention, - "number between 0..1, controls how many neurons are preserved " - "(0.0 will keep the value unchanged)"); - po.Read(argc, argv); if (po.NumArgs() != 3 + (crossvalidate ? 0 : 1)) { @@ -120,13 +115,9 @@ int main(int argc, char *argv[]) { nnet.Read(model_filename); nnet.SetTrainOptions(trn_opts); - if (dropout_retention > 0.0) { - nnet_transf.SetDropoutRetention(dropout_retention); - nnet.SetDropoutRetention(dropout_retention); - } if (crossvalidate) { - nnet_transf.SetDropoutRetention(1.0); - nnet.SetDropoutRetention(1.0); + nnet_transf.SetDropoutRate(0.0); + nnet.SetDropoutRate(0.0); } kaldi::int64 total_frames = 0; diff --git a/src/nnetbin/nnet-train-multistream-perutt.cc b/src/nnetbin/nnet-train-multistream-perutt.cc index 53a98b9b03f..154c7fd9c9d 100644 --- a/src/nnetbin/nnet-train-multistream-perutt.cc +++ b/src/nnetbin/nnet-train-multistream-perutt.cc @@ -118,6 +118,11 @@ int main(int argc, char *argv[]) { nnet.Read(model_filename); nnet.SetTrainOptions(trn_opts); + if (crossvalidate) { + nnet_transf.SetDropoutRate(0.0); + nnet.SetDropoutRate(0.0); + } + kaldi::int64 total_frames = 0; // Initialize feature and target readers, diff --git a/src/nnetbin/nnet-train-multistream.cc b/src/nnetbin/nnet-train-multistream.cc index bdc2d132d04..7424759f45b 100644 --- a/src/nnetbin/nnet-train-multistream.cc +++ b/src/nnetbin/nnet-train-multistream.cc @@ -196,6 +196,11 @@ int main(int argc, char *argv[]) { nnet.Read(model_filename); nnet.SetTrainOptions(trn_opts); + if (crossvalidate) { + nnet_transf.SetDropoutRate(0.0); + nnet.SetDropoutRate(0.0); + } + kaldi::int64 total_frames = 0; SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); diff --git a/src/nnetbin/nnet-train-perutt.cc b/src/nnetbin/nnet-train-perutt.cc index 04ee0b97bab..c8695ffa4ff 100644 --- a/src/nnetbin/nnet-train-perutt.cc +++ b/src/nnetbin/nnet-train-perutt.cc @@ -114,6 +114,11 @@ int main(int argc, char *argv[]) { nnet.Read(model_filename); nnet.SetTrainOptions(trn_opts); + if (crossvalidate) { + 
nnet_transf.SetDropoutRate(0.0); + nnet.SetDropoutRate(0.0); + } + kaldi::int64 total_frames = 0; SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
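# Usage sketch for the new --dropout-rate option of nnet-copy introduced above
# (the model path is an assumed example from the 'relu + dropout' recipe):
# setting the rate to 0.0 switches dropout off in a stored network, mirroring
# what nnet-forward and the cross-validation branches of the training tools now
# do internally via SetDropoutRate(0.0); the default -1.0 keeps the stored rate.
#   nnet-copy --dropout-rate=0.0 exp/ihm/dnn4d-6L1024-relu/final.nnet final_nodrop.nnet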